From 700898fe8acc7a32c1bdf339bc13beb2625819f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E7=8E=89=E6=98=86?= Date: Mon, 29 Aug 2016 17:07:37 +0800 Subject: [PATCH] =?UTF-8?q?fixed=20#301=20=E4=BF=AE=E5=A4=8D=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E6=B3=A8=E8=A7=A3=E6=8A=BD=E5=8F=96JSON=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/example/GithubRepoApi.java | 10 ++++---- .../codecraft/webmagic/model/Extractor.java | 2 +- .../webmagic/model/PageModelExtractor.java | 25 ++++++++++++++++++- .../webmagic/model/annotation/ExtractBy.java | 3 ++- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java index 34608fd9..4181bb9e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java @@ -15,19 +15,19 @@ import java.util.List; */ public class GithubRepoApi implements HasKey { - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name") + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name", source = ExtractBy.Source.RawText) private String name; - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..owner.login") + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..owner.login", source = ExtractBy.Source.RawText) private String author; - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.language",multi = true) + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.language",multi = true, source = ExtractBy.Source.RawText) private List language; - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.stargazers_count") + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.stargazers_count", source = ExtractBy.Source.RawText) private int star; - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.homepage") + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.forks_count", source = ExtractBy.Source.RawText) private int fork; @ExtractByUrl diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index 32f561e2..f1d2f84d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -17,7 +17,7 @@ class Extractor { protected final boolean multi; - static enum Source {Html, Url, RawHtml} + static enum Source {Html, Url, RawHtml, RawText} public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 9816c714..a1da94bd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -179,7 +179,24 @@ class PageModelExtractor { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); - fieldExtractor = new FieldExtractor(field, selector, extractBy.source() == ExtractBy.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, + + FieldExtractor.Source source = null; + switch (extractBy.source()){ + case RawText: + source = FieldExtractor.Source.RawText; + break; + case RawHtml: + source = FieldExtractor.Source.RawHtml; + break; + case SelectedHtml: + source =FieldExtractor.Source.Html; + break; + default: + source =FieldExtractor.Source.Html; + + } + + fieldExtractor = new FieldExtractor(field, selector, source, extractBy.notNull(), extractBy.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -284,6 +301,9 @@ class PageModelExtractor { case Url: value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); break; + case RawText: + value = fieldExtractor.getSelector().selectList(page.getRawText()); + break; default: value = fieldExtractor.getSelector().selectList(html); } @@ -312,6 +332,9 @@ class PageModelExtractor { case Url: value = fieldExtractor.getSelector().select(page.getUrl().toString()); break; + case RawText: + value = fieldExtractor.getSelector().select(page.getRawText()); + break; default: value = fieldExtractor.getSelector().select(html); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 2e23aa00..8e02895a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -52,7 +52,8 @@ public @interface ExtractBy { /** * extract from the raw html */ - RawHtml + RawHtml, + RawText } /**