Assignment

Assignment Brief:

Develop a component-based web crawler and search engine using the Spring Framework, the Apache HttpClient, the Jerry HTTP parser, and the Lucene search engine. The major components / assemblies of the system are shown below. The dotted lines represent a loosely coupled link which is wired up using dependency injection and the Spring Framework.

The basic setup for this project will be supplied as a zip file on the Moodle homepage and comes with all required libraries and the initial interface setup. You will need to create your own interface and implementation for the Index component. You will also need to create your own implementations of the HttpParser and HttpClient interfaces. It might also be a good idea to define your own system logging interface and components.

Web Crawler

The web crawler component is given a URI (Uniform Resource Identifier) as a starting point and will proceed to:
1. Download the URI content using the HttpClient component.
2. Parse the HTML content using the HttpParser component, extracting the a href links.
3. The Spider component will store standardised versions of these links, so that relative links and other mistyped links are stored as absolute links (see the sketch after this list).
4. These standardised links will be added to a queue (use a LinkedList, removing elements from the top of the list and adding new elements to the end of the list). You will need to keep track of already visited URIs using a separate HashSet.
5. If a URI has already been downloaded, do not add it again to the web crawler's queue.
6. Restart the crawling process recursively, removing the next URI to be processed from the top of the queue.
7. Log any exceptions or broken links; do not let this stop the web crawling process.
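
As a concrete illustration of steps 3-5, here is a minimal sketch of link standardisation and queue bookkeeping, assuming jsoup (which the solution below uses). The LinkNormaliser class and its method names are illustrative only; they are not part of the supplied skeleton.

    import java.util.HashSet;
    import java.util.LinkedList;
    import org.jsoup.nodes.Element;

    // Hypothetical helper illustrating steps 3-5 of the crawling process.
    class LinkNormaliser {
        private final LinkedList<String> queue = new LinkedList<>();
        private final HashSet<String> visited = new HashSet<>();

        // jsoup's Element.absUrl resolves a relative href ("/about") against the
        // document's base URI, yielding an absolute link ("http://example.com/about").
        String normalise(Element link) {
            String absolute = link.absUrl("href"); // "" if the href cannot be resolved
            int fragment = absolute.indexOf('#');
            return fragment >= 0 ? absolute.substring(0, fragment) : absolute; // drop #fragment
        }

        // Steps 4-5: add to the end of the queue, but only if never seen before.
        void enqueue(String uri) {
            if (!uri.isEmpty() && !visited.contains(uri) && !queue.contains(uri)) {
                queue.addLast(uri);
            }
        }

        // Take the next URI from the top of the queue and mark it visited.
        String next() {
            String uri = queue.removeFirst();
            visited.add(uri);
            return uri;
        }
    }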

Lucene Search Engine

Develop a simple Index interface and IndexImpl component using Lucene. The IndexImpl will take the parsed content from a webpage, remove the HTML tags, and will analyse and index the textual content of the webpage as a Lucene Document. The Lucene indexing component will need to create one Lucene Document object per HTML document, with two fields. The first field will represent the textual content of the webpage, and the second field will define the URI of the web document.

Use a Struts, JavaFX or Swing based application to test the search engine, using an IndexSearcher to query the index and retrieve results, and use appropriate widgets to display the search engine results.
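
Before looking at the full solution, the sketch below shows the two calls the brief describes: creating a two-field Document, and querying the index with an IndexSearcher. The field names "contents" and "url" match the IndexImpl in the solution; the writer and searcher are assumed to already be set up.

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;

    // Sketch of the two-field Lucene Document and an IndexSearcher query.
    class LuceneSketch {
        // Index one page: the analysed text content plus its URI, both stored.
        static void index(IndexWriter writer, String pageText, String pageUri) throws Exception {
            Document doc = new Document();
            doc.add(new TextField("contents", pageText, Field.Store.YES));
            doc.add(new TextField("url", pageUri, Field.Store.YES));
            writer.addDocument(doc);
        }

        // Query the "contents" field and read the stored "url" back from each hit.
        static void query(IndexSearcher searcher, String userQuery) throws Exception {
            Query q = new QueryParser("contents", new StandardAnalyzer()).parse(userQuery);
            for (ScoreDoc hit : searcher.search(q, 10).scoreDocs) {
                System.out.println(searcher.doc(hit.doc).get("url"));
            }
        }
    }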

Optional
The current application uses one very specific type of HttpClient and HttpParser; extend the application to allow the implementation of these interfaces to change. Develop new HttpClientImpl and HttpParserImpl concrete classes (with appropriate names) to enable the use of different libraries.
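
Because the components are wired by Spring, swapping implementations only requires changing the bean definitions. The solution's main method loads META-INF/config.xml, whose contents are not included here; since the implementation classes are annotated with @Service, the real file may simply enable component scanning. The following is a hypothetical sketch of such a file using explicit bean definitions, which makes the swap point visible: changing libraries means pointing the httpClient and httpParser bean classes at different implementations.

    <?xml version="1.0" encoding="UTF-8"?>
    <beans xmlns="http://www.springframework.org/schema/beans"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.springframework.org/schema/beans
                               http://www.springframework.org/schema/beans/spring-beans.xsd">

        <!-- Swap these class attributes to use a different HTTP library. -->
        <bean id="httpClient" class="com.example.webCrowller.impl.HttpClientImpl"/>
        <bean id="httpParser" class="com.example.webCrowller.impl.HttpParserImpl"/>
        <bean id="index" class="com.example.webCrowller.impl.IndexImpl"/>

        <!-- Constructor injection matching WebCrowler's @Autowired constructor. -->
        <bean id="webCrowler" class="com.example.webCrowller.WebCrowler">
            <constructor-arg ref="httpClient"/>
            <constructor-arg ref="httpParser"/>
            <constructor-arg ref="index"/>
        </bean>
    </beans>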

Solution 

HttpClient.java 

package com.example.webCrowller;

import org.jsoup.nodes.Document;

public interface HttpClient {

    // Downloads the resource at the given URI and returns it as a parsed
    // document, or null if the download fails.
    Document downloadDocument(String uri);
}

HttpParser.java 

package com.example.webCrowller;

import java.util.List;

import org.jsoup.nodes.Document;

public interface HttpParser {

    // Extracts the a href links from a parsed document.
    List<String> extractLinks(Document document);
}

Index.java 

package com.example.webCrowller;

import org.jsoup.nodes.Document;

import java.util.List;

public interface Index {

    // Adds one parsed web page to the index under the given URL.
    void addDocument(Document document, String url);

    // Returns the URLs of pages matching the query.
    List<String> search(String q);
}

UserInterface.java 

package com.example.webCrowller;

import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import javax.swing.*;
import javax.swing.table.DefaultTableModel;
import java.awt.*;
import java.awt.event.MouseAdapter;
import java.awt.event.MouseEvent;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

public class UserInterface extends JFrame {

    private WebCrowler webCrowler;
    private JTextField searchTf;
    private JLabel searchLabel;
    private JButton searchButton;
    private JTable searchResults;
    private JLabel indexingImage;
    private JLabel indexingLabel;
    private JLabel linkLabel;
    private List<String> links = new ArrayList<>();

    public UserInterface() {
        this.setTitle("Search engine");
        this.setSize(700, 500);
        this.setLayout(null);
        this.searchButton = new JButton("Search");
        this.searchLabel = new JLabel("Search Text: ");
        this.indexingImage = new JLabel(new ImageIcon("giphy.gif"));
        this.indexingLabel = new JLabel("Indexing");
        this.searchTf = new JTextField();
        this.linkLabel = new JLabel();
        init();
        setVisible(true);
    }

    public UserInterface(WebCrowler webCrowler) {
        this();
        this.webCrowler = webCrowler;
    }

    private void init() {
        this.searchLabel.setBounds(10, 10, 110, 20);
        this.searchTf.setBounds(120, 10, 200, 30);
        this.searchButton.setBounds(340, 10, 150, 30);
        this.indexingLabel.setBounds(300, 250, 100, 30);
        this.indexingImage.setBounds(300, 200, 50, 50);
        this.linkLabel.setBounds(100, 270, 500, 50);
        this.linkLabel.setForeground(Color.BLUE);

        Object[][] data = new Object[][]{};
        String[] header = {"Url"};
        DefaultTableModel model = new DefaultTableModel(data, header);
        this.searchResults = new JTable(model);

        // Open the clicked result row in the system browser.
        this.searchResults.addMouseListener(new MouseAdapter() {
            @Override
            public void mouseClicked(MouseEvent e) {
                int row = searchResults.rowAtPoint(e.getPoint());
                Desktop desktop = Desktop.isDesktopSupported() ? Desktop.getDesktop() : null;
                if (desktop != null && desktop.isSupported(Desktop.Action.BROWSE)) {
                    try {
                        desktop.browse(URI.create(links.get(row)));
                    } catch (Exception ignored) {
                    }
                }
            }
        });

        add(indexingLabel);
        add(indexingImage);
        add(searchLabel);
        add(searchTf);
        add(searchButton);
        add(searchResults);
        add(linkLabel);

        this.searchButton.addActionListener(ev -> {
            String searchText = searchTf.getText();
            if (!searchText.isEmpty()) {
                System.out.println("Searching " + searchText);
                searching();
                links = webCrowler.search(searchText);
                System.out.println("End Searching " + links.size() + " Results");
                endSearching();
                searchResults();
            }
        });

        // The search controls stay hidden until indexing has finished.
        searchLabel.setVisible(false);
        searchTf.setVisible(false);
        searchButton.setVisible(false);
        searchResults.setVisible(false);

        setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
        setLocationRelativeTo(null);
    }

    private void searching() {
        indexingLabel.setText("Searching");
        searchResults.setVisible(false);
        indexingImage.setVisible(true);
        indexingLabel.setVisible(true);
    }

    private void endSearching() {
        searchResults.setVisible(true);
        indexingImage.setVisible(false);
        indexingLabel.setVisible(false);
    }

    private void searchResults() {
        DefaultTableModel model = (DefaultTableModel) this.searchResults.getModel();
        int rowCount = model.getRowCount();
        // Remove rows one by one from the end of the table.
        for (int i = rowCount - 1; i >= 0; i--) {
            model.removeRow(i);
        }
        for (String link : links) {
            model.addRow(new Object[]{link});
        }
        this.searchResults.setBounds(10, 50, 680, 400);
    }

    public static void main(String[] args) {
        ApplicationContext context =
                new ClassPathXmlApplicationContext("META-INF/config.xml");
        WebCrowler webCrowler = context.getBean(WebCrowler.class);
        UserInterface userInterface = new UserInterface(webCrowler);
        webCrowler.process(userInterface);
        userInterface.endIndexing();
    }

    public void updateUrl(String url) {
        this.linkLabel.setText(url);
    }

    private void endIndexing() {
        searchLabel.setVisible(true);
        searchTf.setVisible(true);
        searchButton.setVisible(true);
        searchResults.setVisible(true);
        indexingImage.setVisible(false);
        indexingLabel.setVisible(false);
        linkLabel.setVisible(false);
    }
}

WebCrowler.java 

package com.example.webCrowller;

import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;

@Component
public class WebCrowler {

    private static final int MAX_DEPTH = 2;

    private final HttpClient httpClient;
    private final Index index;
    private final HttpParser httpParser;

    private HashSet<String> visitedLinks = new HashSet<String>();
    private LinkedList<WebCrowlerUrl> urlQueue = new LinkedList<WebCrowlerUrl>();

    public static final String STARTING_LINK = "http://www.unitec.ac.nz";
    public static final String INDEX_DIRECTORY_PATH = "/var/search";

    @Autowired
    public WebCrowler(HttpClient httpClient, HttpParser httpParser, Index index) {
        this.httpClient = httpClient;
        this.httpParser = httpParser;
        this.index = index;
    }

    private void addToQueue(WebCrowlerUrl url) {
        // Skip URIs that have already been visited or are already queued.
        if (!visitedLinks.contains(url.getUrl()) && !urlQueue.contains(url)) {
            urlQueue.addLast(url);
        }
    }

    private WebCrowlerUrl pullUrl() {
        return urlQueue.removeFirst();
    }

    public void process(UserInterface userInterface) {
        urlQueue.add(new WebCrowlerUrl(STARTING_LINK));
        while (!urlQueue.isEmpty()) {
            WebCrowlerUrl url = pullUrl();
            visitedLinks.add(url.getUrl());
            if (url.getDepth() <= MAX_DEPTH) {
                userInterface.updateUrl(url.getUrl());
                Document document = httpClient.downloadDocument(url.getUrl());
                if (document != null) {
                    index.addDocument(document, url.getUrl());
                    List<String> links = httpParser.extractLinks(document);
                    for (String link : links) {
                        addToQueue(new WebCrowlerUrl(link, url.getDepth() + 1));
                    }
                }
            }
        }
    }

    public List<String> search(String searchText) {
        return index.search(searchText);
    }
}

class WebCrowlerUrl {

    private String url;
    private int depth;

    public WebCrowlerUrl(String url) {
        this(url, 1);
    }

    public WebCrowlerUrl(String url, int depth) {
        this.url = url;
        this.depth = depth;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public int getDepth() {
        return depth;
    }

    public void setDepth(int depth) {
        this.depth = depth;
    }

    // equals/hashCode compare by URL only, so the same page reached at
    // different depths counts as the same queue entry.
    @Override
    public boolean equals(Object obj) {
        if (this == obj) return true;
        if (!(obj instanceof WebCrowlerUrl)) return false;
        return url.equals(((WebCrowlerUrl) obj).url);
    }

    @Override
    public int hashCode() {
        return url.hashCode();
    }
}

HttpClientImpl.java 

package com.example.webCrowller.impl;

import com.example.webCrowller.HttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Service;

@Service
public class HttpClientImpl implements HttpClient {

    // Downloads and parses the page with jsoup; returns null on failure
    // so a broken link does not stop the crawl.
    public Document downloadDocument(String uri) {
        System.out.println("Visiting " + uri);
        try {
            return Jsoup.connect(uri).get();
        } catch (Exception e) {
            System.err.println("[EXCEPTION] " + e.getMessage());
        }
        return null;
    }
}

HttpParserImpl.java 

package com.example.webCrowller.impl;

import com.example.webCrowller.HttpParser;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

@Service
public class HttpParserImpl implements HttpParser {

    public List<String> extractLinks(Document document) {
        List<String> result = new ArrayList<String>();
        Elements links = document.select("a[href]");
        for (Element link : links) {
            // absUrl resolves relative hrefs against the document's base URI,
            // so relative links are standardised to absolute links (step 3 of the brief).
            String linkHref = link.absUrl("href");
            if (isValidLink(linkHref)) {
                result.add(linkHref);
            }
        }
        return result;
    }

    private boolean isValidLink(String linkHref) {
        if (linkHref.startsWith("http://") || linkHref.startsWith("https://")) {
            try {
                // Reject syntactically invalid URLs.
                new URL(linkHref).toURI();
                return true;
            } catch (Exception ignored) {
            }
        }
        return false;
    }
}

IndexImpl.java 

package com.example.webCrowller.impl;

import com.example.webCrowller.Index;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.springframework.stereotype.Service;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static com.example.webCrowller.WebCrowler.INDEX_DIRECTORY_PATH;

@Service
public class IndexImpl implements Index {

    private static StandardAnalyzer analyzer = new StandardAnalyzer();

    private IndexWriter writer;
    private FSDirectory dir;

    public IndexImpl() {
        try {
            dir = FSDirectory.open(new File(INDEX_DIRECTORY_PATH).toPath());
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            writer = new IndexWriter(dir, config);
        } catch (IOException ignored) {
        }
    }

    // One Lucene Document per web page: the analysed text content plus the URI.
    public void addDocument(org.jsoup.nodes.Document document, String url) {
        try {
            Document doc = new Document();
            doc.add(new TextField("contents", document.text(), Field.Store.YES));
            doc.add(new TextField("url", url, Field.Store.YES));
            writer.addDocument(doc);
        } catch (Exception e) {
            System.err.println("[EXCEPTION] " + e.getMessage());
        }
    }

    @Override
    public List<String> search(String s) {
        List<String> result = new ArrayList<>();
        try {
            // Commit (rather than close) the writer so the index can still be
            // searched and written to on subsequent calls.
            writer.commit();
            IndexReader reader = DirectoryReader.open(dir);
            IndexSearcher searcher = new IndexSearcher(reader);
            Query q = new QueryParser("contents", analyzer).parse(s);
            // Create a fresh collector per search; reusing one would
            // accumulate hits across searches.
            TopScoreDocCollector collector = TopScoreDocCollector.create(5);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            for (ScoreDoc hit : hits) {
                Document document = searcher.doc(hit.doc);
                result.add(document.get("url"));
            }
            reader.close();
        } catch (Exception ignored) {
        }
        return result;
    }
}