webmagic-core

pull/17/head
yihua.huang 12 years ago
parent 17f8ead28f
commit 90bbe9b951

@ -9,13 +9,13 @@ import java.util.List;
/**
*
* Object storing extracted result and urls to be crawled.<br>
* Object storing extracted result and urls to fetch.<br>
* Main methods: <br>
* {@link #getUrl()} get url of current page <br>
* {@link #getHtml()} get content of current page <br>
* {@link #putField(String, Object)} save extracted result <br>
* {@link #getResultItems()} get extracted results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
@ -71,7 +71,7 @@ public class Page {
}
/**
* add urls to crawl
* add urls to fetch
*
* @param requests
*/
@ -88,7 +88,7 @@ public class Page {
}
/**
* add url to crawl
* add url to fetch
*
* @param requestString
*/
@ -103,7 +103,7 @@ public class Page {
}
/**
* add requests to crawl
* add requests to fetch
*
* @param request
*/
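Taken together, the methods above form the extraction API of Page. A minimal sketch of how they combine inside a PageProcessor; the field name, XPath expression and domain are illustrative, and the processor package name is assumed from the repository layout:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

public class TitleProcessor implements PageProcessor {

    private Site site = Site.me().setDomain("example.com");

    @Override
    public void process(Page page) {
        // save an extracted value for the Pipeline
        page.putField("title", page.getHtml().xpath("//title"));
        // queue all links found on the page as urls to fetch
        page.addTargetRequests(page.getHtml().links().all());
    }

    @Override
    public Site getSite() {
        return site;
    }
}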

@ -5,16 +5,17 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
* Downloader is the downloading part of webmagic; it uses HttpComponent by default.<br>
* Downloader is the part that downloads web pages and stores them in a Page object. <br>
* Downloader has a {@link #setThread(int)} method because the downloader is usually the bottleneck of a crawler:
* it relies on mechanisms such as pooling, and the pool size should match the number of threads.
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 12:14
* @since 0.1.0
*/
public interface Downloader {
/**
* Download the page and store it in a Page object.
* Downloads web pages and stores them in a Page object.
*
* @param request
* @param task
@ -23,10 +24,8 @@ public interface Downloader {
public Page download(Request request, Task task);
/**
* Set the thread count: in multi-threaded crawling the threads share one Downloader,<br>
* so the Downloader can size its internal resources to the thread count.<br>
*
* @param thread number of threads
* Tell the downloader how many threads the spider uses.
* @param threadNum number of threads
*/
public void setThread(int thread);
public void setThread(int threadNum);
}
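Because download(Request, Task) and setThread(int) are the whole contract, a custom Downloader can simply decorate the HttpClient-based one. A sketch only; the downloader package name and the timing log are assumptions, not part of this commit:

import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;

public class TimingDownloader implements Downloader {

    private final Downloader delegate = new HttpClientDownloader();
    private final Logger logger = Logger.getLogger(getClass());

    @Override
    public Page download(Request request, Task task) {
        long start = System.currentTimeMillis();
        Page page = delegate.download(request, task);
        logger.info("downloaded " + request.getUrl() + " in "
                + (System.currentTimeMillis() - start) + "ms");
        return page;
    }

    @Override
    public void setThread(int threadNum) {
        // forward the spider's thread count so the delegate can size its pool
        delegate.setThread(threadNum);
    }
}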

@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpResponse;
import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
@ -22,12 +23,12 @@ import java.util.Set;
/**
* Downloader based on HttpClient, supporting gzip and custom UA/cookie settings.<br>
* The http downloader based on HttpClient.
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 12:15
* @since 0.1.0
*/
@ThreadSafe
public class HttpClientDownloader implements Downloader {
private Logger logger = Logger.getLogger(getClass());
@ -35,14 +36,14 @@ public class HttpClientDownloader implements Downloader {
private int poolSize = 1;
/**
* A convenience method to download a page by url.
* A simple method to download a url.
*
* @param url
* @return html
*/
public Html download(String url) {
Page page = download(new Request(url), null);
return (Html)page.getHtml();
return (Html) page.getHtml();
}
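The convenience overload above allows one-off fetches without setting up a full spider; a short usage sketch (the url is a placeholder, and the selector package for Html is assumed):

import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.selector.Html;

public class DownloadDemo {
    public static void main(String[] args) {
        Html html = new HttpClientDownloader().download("http://example.com/");
        System.out.println(html.xpath("//title"));
    }
}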
@Override

@ -20,8 +20,7 @@ import java.util.Map;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 12:29
* @since 0.1.0
*/
public class HttpClientPool {

@ -1,5 +1,5 @@
<html>
<body>
Contains the Downloader interface for page downloading and its implementation HttpClientDownloader, which wraps the HttpComponent library.
Downloader is the part that downloads web pages and stores them in a Page object.
</body>
</html>

@ -6,11 +6,11 @@ import us.codecraft.webmagic.Task;
import java.util.Map;
/**
* Print results to the console; usually used for testing.<br>
* Write results in console.<br>
* Usually used in test.
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 1:45
* @since 0.1.0
*/
public class ConsolePipeline implements Pipeline {

@ -1,6 +1,7 @@
package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.annotation.ThreadSafe;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
@ -12,28 +13,23 @@ import java.io.PrintWriter;
import java.util.Map;
/**
*
* Store results in files.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 6:28
* @since 0.1.0
*/
public class FilePipeline extends FilePersistentBase implements Pipeline {
@ThreadSafe
public class FilePipeline extends FilePersistentBase implements Pipeline {
private Logger logger = Logger.getLogger(getClass());
/**
* Create a FilePipeline using the default path "/data/webmagic/".
* create a FilePipeline with default path "/data/webmagic/"
*/
public FilePipeline() {
setPath("/data/webmagic/");
}
/**
* Create a FilePipeline with the given path.
*
* @param path
*/
public FilePipeline(String path) {
setPath(path);
}
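Choosing between the two constructors then looks like this; the custom path is illustrative:

import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;

public class FilePipelineDemo {
    public static void main(String[] args) {
        Pipeline byDefault = new FilePipeline();              // writes under "/data/webmagic/"
        Pipeline custom = new FilePipeline("/tmp/webmagic/"); // writes under the given path
    }
}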

@ -4,12 +4,21 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
/**
* Pipeline is the offline-processing and persistence part of the crawler; implement a Pipeline for custom persistence.<br>
* Pipeline is the persistence and offline-processing part of the crawler.<br>
* The interface Pipeline can be implemented to customize the way results are persisted.
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 1:39
* @since 0.1.0
* @see ConsolePipeline
* @see FilePipeline
*/
public interface Pipeline {
public void process(ResultItems resultItems,Task task);
/**
* Process extracted results.
*
* @param resultItems
* @param task
*/
public void process(ResultItems resultItems, Task task);
}
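A custom Pipeline is then a single method. The sketch below assumes ResultItems exposes a getAll() view of the fields saved with putField; only process() itself appears in this diff:

import java.util.Map;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class LoggingPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        // print every extracted field; getAll() is assumed to return Map<String, Object>
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            System.out.println(entry.getKey() + ":\t" + entry.getValue());
        }
    }
}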

@ -1,5 +1,5 @@
<html>
<body>
Contains the Pipeline interface for processing extracted page results and several of its implementations.
Pipeline is the persistence and offline-processing part of the crawler.
</body>
</html>

@ -4,23 +4,33 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
/**
* PageProcessor is the interface for custom page processing.<br>
* Extend the class to implement various spiders.<br>
* Interface to be implemented to customize a crawler.<br>
* <br>
* In PageProcessor, you can customize:
* <p/>
* start urls and other settings in {@link Site}<br>
* how the urls to fetch are detected <br>
* how the data are extracted and stored <br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 11:42
* @see Site
* @see Page
* @since 0.1.0
*/
public interface PageProcessor {
/**
*
* process the page: extract the urls to fetch, then extract and store the data
*
* @param page
*/
public void process(Page page);
/**
* Site settings, such as cookie and UA.<br>
* get the site settings
*
* @return site
* @see Site
*/
public Site getSite();
}

@ -7,10 +7,10 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.List;
/**
* A simple implementation that extracts the links and content of pages.<br>
* A simple PageProcessor.
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-22
* Time: 9:15
* @since 0.1.0
*/
public class SimplePageProcessor implements PageProcessor {
@ -22,25 +22,25 @@ public class SimplePageProcessor implements PageProcessor {
this.site = Site.me().addStartUrl(startUrl).
setDomain(UrlUtils.getDomain(startUrl));
//compile "*" expression to regex
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
}
@Override
public void process(Page page) {
List<String> requests = page.getHtml().links().regex(urlPattern).all();
//call page.addTargetRequests() to add the links to fetch
//add urls to fetch
page.addTargetRequests(requests);
//extract by xpath
//extract by XPath
page.putField("title", page.getHtml().xpath("//title"));
//sc means extracting the body text with the Readability technique
page.putField("html", page.getHtml().toString());
//extract by Readability
page.putField("content", page.getHtml().smartContent());
}
@Override
public Site getSite() {
//define the site settings for extraction
//settings
return site;
}
}
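Constructing the processor only needs a start url and a wildcard pattern, which the constructor compiles into the regex above. A sketch with placeholder urls, assuming the two-argument constructor implied by the snippet:

import us.codecraft.webmagic.processor.SimplePageProcessor;

public class SimpleProcessorDemo {
    public static void main(String[] args) {
        // follow urls under my.site and extract title and content as shown above
        SimplePageProcessor processor =
                new SimplePageProcessor("http://my.site/", "http://my.site/*");
    }
}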

@ -1,5 +1,5 @@
<html>
<body>
Contains the PageProcessor interface that encapsulates page-processing logic, and an implementation, SimplePageProcessor. Implement PageProcessor to customize your own crawler.
PageProcessor is the custom part of a crawler for a specific site.
</body>
</html>

@ -1,5 +1,6 @@
package us.codecraft.webmagic.scheduler;
import org.apache.http.annotation.ThreadSafe;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
@ -10,11 +11,13 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
/**
* A thread-safe Scheduler backed by an in-memory queue.<br>
* Basic Scheduler implementation.<br>
* Stores urls to fetch in a LinkedBlockingQueue and removes duplicate urls with a HashSet.
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 1:13
* @since 0.1.0
*/
@ThreadSafe
public class QueueScheduler implements Scheduler {
private Logger logger = Logger.getLogger(getClass());
@ -24,11 +27,11 @@ public class QueueScheduler implements Scheduler {
private Set<String> urls = new HashSet<String>();
@Override
public synchronized void push(Request request,Task task) {
if (logger.isDebugEnabled()){
logger.debug("push to queue "+request.getUrl());
public synchronized void push(Request request, Task task) {
if (logger.isDebugEnabled()) {
logger.debug("push to queue " + request.getUrl());
}
if (urls.add(request.getUrl())){
if (urls.add(request.getUrl())) {
queue.add(request);
}
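Because urls is a Set, pushing the same url twice leaves the queue unchanged. A sketch; passing null for the Task mirrors the convenience call in HttpClientDownloader and is an assumption here:

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.scheduler.QueueScheduler;

public class DedupDemo {
    public static void main(String[] args) {
        QueueScheduler scheduler = new QueueScheduler();
        scheduler.push(new Request("http://example.com/"), null);
        scheduler.push(new Request("http://example.com/"), null); // duplicate, silently dropped
    }
}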

@ -4,23 +4,27 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
* Scheduler manages the urls to fetch and removes duplicate urls.<br>
* Every Scheduler method takes a Task parameter, so one Scheduler can be shared by several Tasks (a Spider is a Task).<br>
* Scheduler is the part for url management.<br>
* You can implement the interface Scheduler to:
* manage urls to fetch,
* remove duplicate urls.
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 1:12
* @since 0.1.0
*/
public interface Scheduler {
/**
*
* @param request
* @param task the Task that uses the Scheduler
* add a url to fetch
*
* @param request
* @param task
*/
public void push(Request request,Task task);
public void push(Request request, Task task);
/**
* get a url to fetch
*
* @param task the Task that uses the Scheduler
* @return the url to fetch
*/
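With only these two methods to supply, an alternative strategy is easy to sketch. The LIFO scheduler below assumes poll(Task) returning a Request is the second interface method, as the truncated javadoc above suggests:

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.Scheduler;

public class StackScheduler implements Scheduler {

    private Deque<Request> stack = new ArrayDeque<Request>();
    private Set<String> urls = new HashSet<String>();

    @Override
    public synchronized void push(Request request, Task task) {
        // same dedup idea as QueueScheduler, but depth-first ordering
        if (urls.add(request.getUrl())) {
            stack.push(request);
        }
    }

    @Override
    public synchronized Request poll(Task task) {
        return stack.poll(); // null when no url is left
    }
}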

@ -1,5 +1,5 @@
<html>
<body>
Contains the Scheduler interface for url management and scheduling, and several of its implementations.
Scheduler is the part for url management.
</body>
</html>

@ -4,6 +4,8 @@ import java.util.ArrayList;
import java.util.List;
/**
* All selectors will be arranged as a pipeline. <br>
* The next selector uses the result of the previous as source.
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
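The chaining described above reduces to a loop in which each selector consumes the previous result. A conceptual sketch, not the class body itself:

// how a selector chain evaluates, conceptually
public static String selectChain(List<Selector> selectors, String text) {
    String result = text;
    for (Selector selector : selectors) {
        if (result == null) {
            break; // an empty intermediate result stops the chain
        }
        result = selector.select(result);
    }
    return result;
}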

@ -10,10 +10,10 @@ import java.util.ArrayList;
import java.util.List;
/**
* CSS selector, based on Jsoup.<br>
* CSS selector. Based on Jsoup.
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 9:39
* @since 0.1.0
*/
public class CssSelector implements Selector {

@ -4,6 +4,8 @@ import java.util.ArrayList;
import java.util.List;
/**
* All extractors will extract separately, <br>
* and their results will be combined as the final result.
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
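By contrast, the combining behaviour described here runs every extractor against the same source and merges the lists. A conceptual sketch, assuming the list form of Selector is named selectList as in the interface documented below:

// how independent extractors are merged, conceptually
public static List<String> selectAll(List<Selector> selectors, String text) {
    List<String> results = new ArrayList<String>();
    for (Selector selector : selectors) {
        results.addAll(selector.selectList(text));
    }
    return results;
}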

@ -4,13 +4,16 @@ import java.util.List;
/**
* Selector(extractor) for text.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
public interface Selector {
/**
* Extract single result in text.<br>
* If there is more than one result, only the first will be chosen.
*
* @param text
* @return result
*/
@ -18,6 +21,7 @@ public interface Selector {
/**
* Extract all results in text.<br>
*
* @param text
* @return results
*/
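A concrete Selector is small. The sketch below backs both documented methods with java.util.regex; selectList is an assumed name for the list form, whose signature this hunk truncates:

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SimpleRegexSelector implements Selector {

    private final Pattern pattern;

    public SimpleRegexSelector(String regex) {
        this.pattern = Pattern.compile(regex);
    }

    @Override
    public String select(String text) {
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group() : null; // first match only
    }

    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            results.add(matcher.group());
        }
        return results;
    }
}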

@ -24,7 +24,7 @@
{@link #getHtml()} get content of current page
{@link #putField(String, Object)} save extracted result
{@link #getResultItems()} get extracted results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
{@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
{@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
</pre>
