@ -1,7 +1,7 @@
package us.codecraft.webmagic.samples ;
import us.codecraft.webmagic.Site ;
import us.codecraft.webmagic.Page ;
import us.codecraft.webmagic.Site ;
import us.codecraft.webmagic.processor.PageProcessor ;
import java.util.List ;
@ -12,18 +12,34 @@ import java.util.List;
* Time : 下 午 8 : 0 8
* /
public class DiandianBlogProcessor implements PageProcessor {
private Site site ;
@Override
public void process ( Page page ) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List < String > requests = page . getHtml ( ) . rs ( "<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}" ) . toStrings ( ) ;
//a()表示提取链接, as()表示提取所有链接
//getHtml()返回Html对象, 支持链式调用
//r()表示用正则表达式提取一条内容, rs()表示提取多条内容
//toString()表示取单条结果, toStrings()表示取多条
List < String > requests = page . getHtml ( ) . as ( ) . rs ( "(.*/post/.*)" ) . toStrings ( ) ;
//使用page.addTargetRequests()方法将待抓取的链接加入队列
page . addTargetRequests ( requests ) ;
page . putField ( "title" , page . getHtml ( ) . x ( "//div[@id='content']//h2/a" ) ) ;
//page.putField(key,value)将抽取的内容加入结果Map
//x()和xs()使用xpath进行抽取
page . putField ( "title" , page . getHtml ( ) . x ( "//title" ) . r ( "(.*?)\\|" ) ) ;
//sc()使用readability技术直接抽取正文, 对于规整的文本有比较好的抽取正确率
page . putField ( "content" , page . getHtml ( ) . sc ( ) ) ;
page . putField ( "date" , page . getUrl ( ) . r ( "post/(\\d+-\\d+-\\d+)/" ) ) ;
page . putField ( "id" , page . getUrl ( ) . r ( "post/\\d+-\\d+-\\d+/(\\d+)" ) ) ;
}
@Override
public Site getSite ( ) {
return Site . me ( ) . setDomain ( "www.diandian.com" ) . setStartUrl ( "http://17dujingdian.com/" ) .
//site定义抽取配置, 以及开始url等
if ( site = = null ) {
site = Site . me ( ) . setDomain ( "progressdaily.diandian.com" ) . setStartUrl ( "http://progressdaily.diandian.com/" ) .
setUserAgent ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31" ) ;
}
return site ;
}
}