Extracted interfaces from Parser and PageFetcher #421

Open · wants to merge 7 commits into master
==== Configurable.java ====
@@ -13,6 +13,7 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
+ *
*/

package edu.uci.ics.crawler4j.crawler;
@@ -21,7 +22,7 @@
* Several core components of crawler4j extend this class
* to make them configurable.
*
- * @deprecated This will removed without notice.
+ * @deprecated This will be removed without notice.
* @author Yasser Ganjisaffar
*/
@Deprecated
==== CrawlController.java ====
@@ -30,10 +30,13 @@
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

-import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.fetcher.PageFetcherInterface;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
+import edu.uci.ics.crawler4j.frontier.DocIDServerInterface;
import edu.uci.ics.crawler4j.frontier.Frontier;
+import edu.uci.ics.crawler4j.frontier.FrontierInterface;
import edu.uci.ics.crawler4j.parser.Parser;
+import edu.uci.ics.crawler4j.parser.ParserInterface;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.TLDList;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
@@ -75,28 +78,28 @@ public class CrawlController {
*/
protected boolean shuttingDown;

-protected PageFetcher pageFetcher;
+protected PageFetcherInterface pageFetcher;
protected RobotstxtServer robotstxtServer;
-protected Frontier frontier;
-protected DocIDServer docIdServer;
+protected FrontierInterface frontier;
+protected DocIDServerInterface docIdServer;
protected TLDList tldList;

protected final Object waitingLock = new Object();
protected final Environment env;

-protected Parser parser;
+protected ParserInterface parser;

-public CrawlController(CrawlConfig config, PageFetcher pageFetcher,
+public CrawlController(CrawlConfig config, PageFetcherInterface pageFetcher,
RobotstxtServer robotstxtServer) throws Exception {
this(config, pageFetcher, null, robotstxtServer, null);
}

-public CrawlController(CrawlConfig config, PageFetcher pageFetcher,
+public CrawlController(CrawlConfig config, PageFetcherInterface pageFetcher,
RobotstxtServer robotstxtServer, TLDList tldList) throws Exception {
this(config, pageFetcher, null, robotstxtServer, tldList);
}

-public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parser,
+public CrawlController(CrawlConfig config, PageFetcherInterface pageFetcher, ParserInterface parser,
RobotstxtServer robotstxtServer, TLDList tldList) throws Exception {
config.validate();
this.config = config;
@@ -153,7 +156,7 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parse
robotstxtServer.setCrawlConfig(config);
}

-public Parser getParser() {
+public ParserInterface getParser() {
return parser;
}

@@ -582,11 +585,11 @@ public void addSeenUrl(String url, int docId) throws UnsupportedEncodingExceptio
}
}

-public PageFetcher getPageFetcher() {
+public PageFetcherInterface getPageFetcher() {
return pageFetcher;
}

-public void setPageFetcher(PageFetcher pageFetcher) {
+public void setPageFetcher(PageFetcherInterface pageFetcher) {
this.pageFetcher = pageFetcher;
}

@@ -598,19 +601,19 @@ public void setRobotstxtServer(RobotstxtServer robotstxtServer) {
this.robotstxtServer = robotstxtServer;
}

-public Frontier getFrontier() {
+public FrontierInterface getFrontier() {
return frontier;
}

-public void setFrontier(Frontier frontier) {
+public void setFrontier(FrontierInterface frontier) {
this.frontier = frontier;
}

-public DocIDServer getDocIdServer() {
+public DocIDServerInterface getDocIdServer() {
return docIdServer;
}

-public void setDocIdServer(DocIDServer docIdServer) {
+public void setDocIdServer(DocIDServerInterface docIdServer) {
this.docIdServer = docIdServer;
}
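Since the constructors above now accept PageFetcherInterface and ParserInterface, any conforming implementation can be injected into CrawlController. A minimal wiring sketch, assuming only the signatures shown in this diff; the storage folder and the MyCrawler class are placeholders, and RobotstxtServer still takes the concrete PageFetcher as before:

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class InterfaceWiringExample {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j");  // placeholder path

        // The concrete PageFetcher still backs RobotstxtServer, and it now also
        // satisfies PageFetcherInterface for the CrawlController constructor.
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtServer robotstxtServer =
                new RobotstxtServer(new RobotstxtConfig(), pageFetcher);

        // Any other PageFetcherInterface (or ParserInterface, via the
        // five-argument constructor) could be passed here instead.
        CrawlController controller =
                new CrawlController(config, pageFetcher, robotstxtServer);

        // controller.start(MyCrawler.class, 1);  // MyCrawler: hypothetical WebCrawler subclass
    }
}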

==== WebCrawler.java ====
@@ -32,13 +32,13 @@
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
-import edu.uci.ics.crawler4j.fetcher.PageFetcher;
-import edu.uci.ics.crawler4j.frontier.DocIDServer;
-import edu.uci.ics.crawler4j.frontier.Frontier;
+import edu.uci.ics.crawler4j.fetcher.PageFetcherInterface;
+import edu.uci.ics.crawler4j.frontier.DocIDServerInterface;
+import edu.uci.ics.crawler4j.frontier.FrontierInterface;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.NotAllowedContentException;
import edu.uci.ics.crawler4j.parser.ParseData;
-import edu.uci.ics.crawler4j.parser.Parser;
+import edu.uci.ics.crawler4j.parser.ParserInterface;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

@@ -71,12 +71,12 @@ public class WebCrawler implements Runnable {
/**
* The parser that is used by this crawler instance to parse the content of the fetched pages.
*/
-private Parser parser;
+private ParserInterface parser;

/**
* The fetcher that is used by this crawler instance to fetch the content of pages from the web.
*/
-private PageFetcher pageFetcher;
+private PageFetcherInterface pageFetcher;

/**
* The RobotstxtServer instance that is used by this crawler instance to
@@ -87,12 +87,12 @@ public class WebCrawler implements Runnable {
/**
* The DocIDServer that is used by this crawler instance to map each URL to a unique docid.
*/
-private DocIDServer docIdServer;
+private DocIDServerInterface docIdServer;

/**
* The Frontier object that manages the crawl queue.
*/
-private Frontier frontier;
+private FrontierInterface frontier;

/**
* Is the current crawler instance waiting for new URLs? This field is
==== PageFetcher.java ====
@@ -78,7 +78,7 @@
/**
* @author Yasser Ganjisaffar
*/
-public class PageFetcher {
+public class PageFetcher implements PageFetcherInterface {
protected static final Logger logger = LoggerFactory.getLogger(PageFetcher.class);
protected final Object mutex = new Object();
/**
@@ -251,6 +251,7 @@ private void doFormLogin(FormAuthInfo authInfo) {
}
}

+@Override
public PageFetchResult fetchPage(WebURL webUrl)
throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
// Getting URL, setting headers & content
@@ -331,6 +332,7 @@ public PageFetchResult fetchPage(WebURL webUrl)
}
}

+@Override
public synchronized void shutDown() {
if (connectionMonitorThread != null) {
connectionManager.shutdown();
==== PageFetcherInterface.java (new file) ====
@@ -0,0 +1,15 @@
package edu.uci.ics.crawler4j.fetcher;

import java.io.IOException;

import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.url.WebURL;

public interface PageFetcherInterface {

PageFetchResult fetchPage(WebURL webUrl) throws InterruptedException, IOException,
PageBiggerThanMaxSizeException;

void shutDown();

}
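The new PageFetcherInterface is intentionally small, so the fetcher can be decorated or mocked without subclassing PageFetcher. As an illustration only (not part of this PR), a hypothetical decorator that counts fetches while delegating to any other implementation:

import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;

import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcherInterface;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical decorator: wraps any PageFetcherInterface and counts fetches,
// something that previously required extending the concrete PageFetcher.
public class CountingPageFetcher implements PageFetcherInterface {

    private final PageFetcherInterface delegate;
    private final AtomicLong fetchCount = new AtomicLong();

    public CountingPageFetcher(PageFetcherInterface delegate) {
        this.delegate = delegate;
    }

    @Override
    public PageFetchResult fetchPage(WebURL webUrl)
            throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
        fetchCount.incrementAndGet();
        return delegate.fetchPage(webUrl);
    }

    @Override
    public void shutDown() {
        delegate.shutDown();
    }

    public long getFetchCount() {
        return fetchCount.get();
    }
}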
==== DocIDServer.java ====
@@ -34,7 +34,7 @@
* @author Yasser Ganjisaffar
*/

-public class DocIDServer {
+public class DocIDServer implements DocIDServerInterface {
private static final Logger logger = LoggerFactory.getLogger(DocIDServer.class);

private final Database docIDsDB;
@@ -68,6 +68,7 @@ public DocIDServer(Environment env, CrawlConfig config) {
* @param url the URL for which the docid is returned.
* @return the docid of the url if it is seen before. Otherwise -1 is returned.
*/
+@Override
public int getDocId(String url) {
synchronized (mutex) {
OperationStatus result = null;
@@ -93,6 +94,7 @@ public int getDocId(String url) {
}
}

+@Override
public int getNewDocID(String url) {
synchronized (mutex) {
try {
@@ -117,6 +119,7 @@ public int getNewDocID(String url) {
}
}

+@Override
public void addUrlAndDocId(String url, int docId) {
synchronized (mutex) {
if (docId <= lastDocID) {
@@ -139,10 +142,12 @@ public void addUrlAndDocId(String url, int docId) {
}
}

+@Override
public boolean isSeenBefore(String url) {
return getDocId(url) != -1;
}

@Override
public final int getDocCount() {
try {
return (int) docIDsDB.count();
@@ -152,6 +157,7 @@ public final int getDocCount() {
}
}

+@Override
public void close() {
try {
docIDsDB.close();
==== DocIDServerInterface.java (new file) ====
@@ -0,0 +1,23 @@
package edu.uci.ics.crawler4j.frontier;

public interface DocIDServerInterface {

/**
* Returns the docid of an already seen url.
*
* @param url the URL for which the docid is returned.
* @return the docid of the url if it is seen before. Otherwise -1 is returned.
*/
int getDocId(String url);

int getNewDocID(String url);

void addUrlAndDocId(String url, int docId);

boolean isSeenBefore(String url);

int getDocCount();

void close();

}
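Because DocIDServerInterface hides the Berkeley DB backed implementation, tests or small in-process crawls could substitute something simpler. A hypothetical in-memory implementation, not included in this PR, might look like:

import java.util.HashMap;
import java.util.Map;

import edu.uci.ics.crawler4j.frontier.DocIDServerInterface;

// Hypothetical in-memory implementation, useful in unit tests where the
// Berkeley DB backed DocIDServer would be unnecessary.
public class InMemoryDocIDServer implements DocIDServerInterface {

    private final Map<String, Integer> docIds = new HashMap<>();
    private int lastDocId = 0;

    @Override
    public synchronized int getDocId(String url) {
        Integer id = docIds.get(url);
        return (id == null) ? -1 : id;
    }

    @Override
    public synchronized int getNewDocID(String url) {
        int existing = getDocId(url);
        if (existing > 0) {
            return existing;  // already seen, keep the old docid
        }
        docIds.put(url, ++lastDocId);
        return lastDocId;
    }

    @Override
    public synchronized void addUrlAndDocId(String url, int docId) {
        docIds.put(url, docId);
        lastDocId = Math.max(lastDocId, docId);
    }

    @Override
    public synchronized boolean isSeenBefore(String url) {
        return getDocId(url) != -1;
    }

    @Override
    public synchronized int getDocCount() {
        return docIds.size();
    }

    @Override
    public void close() {
        // nothing to release for an in-memory map
    }
}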
==== Frontier.java ====
@@ -32,7 +32,7 @@
* @author Yasser Ganjisaffar
*/

-public class Frontier {
+public class Frontier implements FrontierInterface {
protected static final Logger logger = LoggerFactory.getLogger(Frontier.class);

private static final String DATABASE_NAME = "PendingURLsDB";
@@ -82,6 +82,7 @@ public Frontier(Environment env, CrawlConfig config) {
}
}

+@Override
public void scheduleAll(List<WebURL> urls) {
int maxPagesToFetch = config.getMaxPagesToFetch();
synchronized (mutex) {
@@ -109,6 +110,7 @@ public void scheduleAll(List<WebURL> urls) {
}
}

+@Override
public void schedule(WebURL url) {
int maxPagesToFetch = config.getMaxPagesToFetch();
synchronized (mutex) {
@@ -124,6 +126,7 @@ public void schedule(WebURL url) {
}
}

+@Override
public void getNextURLs(int max, List<WebURL> result) {
while (true) {
synchronized (mutex) {
@@ -161,6 +164,7 @@ public void getNextURLs(int max, List<WebURL> result) {
}
}

+@Override
public void setProcessed(WebURL webURL) {
counters.increment(Counters.ReservedCounterNames.PROCESSED_PAGES);
if (inProcessPages != null) {
@@ -170,10 +174,12 @@ public void setProcessed(WebURL webURL) {
}
}

+@Override
public long getQueueLength() {
return workQueues.getLength();
}

+@Override
public long getNumberOfAssignedPages() {
if (inProcessPages != null) {
return inProcessPages.getLength();
@@ -182,18 +188,22 @@ public long getNumberOfAssignedPages() {
}
}

+@Override
public long getNumberOfProcessedPages() {
return counters.getValue(Counters.ReservedCounterNames.PROCESSED_PAGES);
}

+@Override
public long getNumberOfScheduledPages() {
return counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES);
}

+@Override
public boolean isFinished() {
return isFinished;
}

+@Override
public void close() {
workQueues.close();
counters.close();
@@ -202,6 +212,7 @@ public void close() {
}
}

+@Override
public void finish() {
isFinished = true;
synchronized (waitingList) {
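FrontierInterface itself does not appear in this excerpt of the diff. Judging only from the @Override annotations added to Frontier above, it presumably declares at least the following methods; this is a reconstruction, not the PR's actual file:

import java.util.List;

import edu.uci.ics.crawler4j.url.WebURL;

// Reconstruction: method set inferred from the @Override annotations in Frontier.
public interface FrontierInterface {

    void scheduleAll(List<WebURL> urls);

    void schedule(WebURL url);

    void getNextURLs(int max, List<WebURL> result);

    void setProcessed(WebURL webURL);

    long getQueueLength();

    long getNumberOfAssignedPages();

    long getNumberOfProcessedPages();

    long getNumberOfScheduledPages();

    boolean isFinished();

    void close();

    void finish();

}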