add comments

pull/17/head
yihua.huang 12 years ago
parent c2e691a55f
commit 412abeb8df

@ -49,9 +49,7 @@ public class FreemarkerPipeline implements Pipeline {
template.process(page.getFields(), printWriter);
printWriter.close();
} catch (TemplateException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
} catch (IOException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
}
}

@ -1,7 +1,7 @@
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
@ -12,18 +12,34 @@ import java.util.List;
* Time: 8:08
*/
public class DiandianBlogProcessor implements PageProcessor {
private Site site;
// Queues follow-up links and stores the "title", "content", "date" and "id" fields for the given page.
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings();
//a() extracts a link; as() extracts all links
//getHtml() returns an Html object that supports chained calls
//r() extracts a single match using a regular expression; rs() extracts multiple matches
//toString() takes a single result; toStrings() takes multiple results
List<String> requests = page.getHtml().as().rs("(.*/post/.*)").toStrings();
//use page.addTargetRequests() to add the links still to be crawled to the queue
page.addTargetRequests(requests);
page.putField("title",page.getHtml().x("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().sc());
//page.putField(key,value) adds the extracted content to the result Map
//x() and xs() extract using XPath
page.putField("title", page.getHtml().x("//title").r("(.*?)\\|"));
//sc() extracts the main body text directly using a readability technique; it gives fairly good accuracy on well-structured text
page.putField("content", page.getHtml().sc());
page.putField("date", page.getUrl().r("post/(\\d+-\\d+-\\d+)/"));
page.putField("id", page.getUrl().r("post/\\d+-\\d+-\\d+/(\\d+)"));
}
@Override
public Site getSite() {
return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/").
//site defines the extraction configuration, the start URL, and so on
if (site == null) {
site = Site.me().setDomain("progressdaily.diandian.com").setStartUrl("http://progressdaily.diandian.com/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;
}
}

@ -13,6 +13,9 @@ import java.util.List;
* Time: 8:08
*/
public class DiaoyuwengProcessor implements PageProcessor {
private Site site;
@Override
public void process(Page page) {
List<String> requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings();
@ -29,7 +32,10 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
if (site==null){
site= Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500);
}
return site;
}
}

@ -3,6 +3,6 @@ touch wordpress.xml
cat wp-head.xml >> wordpress.xml
for f in `ls`;
do
cat ${f} >> wordpress.xml
cat ${f} >> ../wordpress.xml
done;
cat wp-bottom.xml >> wordpress.xml

@ -0,0 +1,34 @@
package us.codecraft.webmagic.processor;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
import java.io.IOException;
/**
* User: cairne
* Date: 13-6-9
* Time: 8:02
*/
public class DiandianProcessorTest {

    /**
     * Runs a spider over the Diandian blog sample processor, sending results
     * to both the console and a Freemarker template pipeline.
     *
     * @throws IOException declared by the original test signature
     */
    @Test
    public void test() throws IOException {
        DiandianBlogProcessor blogProcessor = new DiandianBlogProcessor();

        // A pipeline is the processing step applied once crawling of a page has finished.
        // The ftl template file belongs in the classpath:ftl/ folder.
        // By default output is placed under the /data/temp/webmagic/ftl/[domain] directory.
        FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl");

        // Spider.me() is a shorthand; it simply creates a new instance.
        // Spider.pipeline() registers a pipeline and supports chained calls.
        // ConsolePipeline prints results to the console.
        // FileCacheQueueSchedular stores URLs and supports resuming an interrupted crawl;
        // its temporary files are written to the /data/temp/webmagic/cache directory.
        // Spider.run() starts execution.
        Spider.me()
                .pipeline(new ConsolePipeline())
                .pipeline(freemarkerPipeline)
                .schedular(new FileCacheQueueSchedular(blogProcessor.getSite(), "/data/temp/webmagic/cache/"))
                .processor(blogProcessor)
                .run();
    }
}
Loading…
Cancel
Save