diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageProcessor.java
new file mode 100644
index 00000000..39705466
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageProcessor.java
@@ -0,0 +1,56 @@
+package us.codecraft.webmagic.example;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.model.PageMapper;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * Example {@link PageProcessor} that starts from a GitHub user page, queues links that
+ * look like user or repository URLs, and stores the result of mapping each page to a
+ * {@link GithubRepo} under the {@code "repo"} result field.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.3.2
+ */
+public class GithubRepoPageProcessor implements PageProcessor {
+
+    /** Site configuration with the retry count set to 3. */
+    private final Site site = Site.me().setRetryTimes(3);
+
+    /** Mapper used to turn a crawled page into a {@link GithubRepo}. */
+    private final PageMapper<GithubRepo> githubRepoPageMapper =
+            new PageMapper<GithubRepo>(GithubRepo.class);
+
+    /**
+     * Queues the repository and user links found on the page, then stores the mapped
+     * {@link GithubRepo} in the {@code "repo"} field.
+     *
+     * @param page the page being processed
+     */
+    @Override
+    public void process(Page page) {
+        // \w does not match '-', so [\w\-] is used to match hyphenated user and
+        // repository names in full.
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
+        GithubRepo githubRepo = githubRepoPageMapper.get(page);
+        page.putField("repo", githubRepo);
+    }
+
+    /** @return the site configuration used by this processor */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Runs the crawler with five threads, starting at {@code https://github.com/code4craft}.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+    }
+}
\ No newline at end of file
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java
new file mode 100644
index 00000000..f23d9366
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java
@@ -0,0 +1,36 @@
+package us.codecraft.webmagic.model;
+
+import us.codecraft.webmagic.Page;
+
+/**
+ * Maps a {@link Page} onto an instance of a model class.
+ *
+ * @param <T> the model type returned by {@link #get(Page)}
+ * @author code4crafter@gmail.com
+ * @since 0.5.2
+ */
+public class PageMapper<T> {
+
+    /** Model class this mapper was created for. */
+    private final Class<T> clazz;
+
+    /**
+     * Creates a mapper for the given model class.
+     *
+     * @param clazz the model class to map pages onto
+     */
+    public PageMapper(Class<T> clazz) {
+        this.clazz = clazz;
+    }
+
+    /**
+     * Maps the given page onto the model type.
+     *
+     * @param page the page to map
+     * @return currently always {@code null}; the mapping is not implemented yet
+     */
+    public T get(Page page) {
+        // TODO: implement the mapping; clazz is stored but not used yet.
+        return null;
+    }
+}