Changed refactor of processSingle again, this one is a better version (#1157)

* Refactor of processSingle in PageModelExtractor

* Changed my refactor of processSingle, this one is a lot better

* Changed my refactor of processSingle, this one is a lot better
pull/1158/head^2
François Gibier 10 months ago committed by GitHub
parent 05e5eefc7d
commit 2df7dca871
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -2,6 +2,8 @@ package us.codecraft.webmagic.model;
import lombok.Getter;
import lombok.Setter;
import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;
/**
@ -21,8 +23,6 @@ public class Extractor {
protected final boolean multi;
public static enum Source {Html, Url, RawHtml, RawText}
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
this.selector = selector;
this.source = source;

@ -1,6 +1,7 @@
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;
import java.lang.reflect.Field;

@ -9,9 +9,9 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
import us.codecraft.webmagic.model.selections.MultipleSelection;
import us.codecraft.webmagic.model.selections.Selection;
import us.codecraft.webmagic.model.selections.SingleSelection;
import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.model.sources.SourceTextExtractor;
import us.codecraft.webmagic.model.sources.Source.*;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ClassUtils;
import us.codecraft.webmagic.utils.ExtractorUtils;
@ -95,7 +95,7 @@ class PageModelExtractor {
regexPattern = ".*";
}
fieldExtractor = new FieldExtractor(field,
new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
@ -121,7 +121,7 @@ class PageModelExtractor {
default:
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
}
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
@ -136,26 +136,23 @@ class PageModelExtractor {
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
Selector selector = ExtractorUtils.getSelector(extractBy);
ExtractBy.Source source0 = extractBy.source();
if (extractBy.type()== ExtractBy.Type.JsonPath){
source0 = RawText;
}
FieldExtractor.Source source = null;
switch (source0){
ExtractBy.Source extractSource = extractBy.source();
if (extractBy.type()== ExtractBy.Type.JsonPath)
extractSource = RawText;
Source source = null;
switch (extractSource) {
case RawText:
source = FieldExtractor.Source.RawText;
source = new RawText();
break;
case RawHtml:
source = FieldExtractor.Source.RawHtml;
source = new RawHtml();
break;
case SelectedHtml:
source =FieldExtractor.Source.Html;
source = new SelectedHtml();
break;
default:
source =FieldExtractor.Source.Html;
source = new SelectedHtml();
}
fieldExtractor = new FieldExtractor(field, selector, source,
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
@ -202,7 +199,7 @@ class PageModelExtractor {
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
}
}
@ -242,8 +239,7 @@ class PageModelExtractor {
try {
o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) {
Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection();
PageField field = selection.extractField(page, html, isRaw, fieldExtractor);
PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
if (!field.operation(o, fieldExtractor, logger))
return null;
}

@ -1,36 +0,0 @@
package us.codecraft.webmagic.model.selections;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.fields.MultipleField;
public class MultipleSelection implements Selection {
public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
List<String> fieldsName;
switch (fieldExtractor.getSource()) {
case RawHtml:
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
break;
case Html:
if (isRaw)
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
else
fieldsName = fieldExtractor.getSelector().selectList(html);
break;
case Url:
fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString());
break;
case RawText:
fieldsName = fieldExtractor.getSelector().selectList(page.getRawText());
break;
default:
fieldsName = fieldExtractor.getSelector().selectList(html);
}
if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) {
return null;
}
return new MultipleField(fieldsName);
}
}

@ -1,9 +0,0 @@
package us.codecraft.webmagic.model.selections;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.fields.PageField;
/**
 * Contract for turning a page plus a {@link FieldExtractor} into a {@link PageField}.
 */
public interface Selection {

    /**
     * Extracts the field described by {@code fieldExtractor}.
     *
     * @param page           page being processed
     * @param html           HTML text supplied by the caller
     * @param isRaw          caller-supplied flag; its exact meaning is up to the implementation
     * @param fieldExtractor extractor describing the field to read
     * @return the extracted field
     */
    PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
}

@ -1,33 +0,0 @@
package us.codecraft.webmagic.model.selections;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.fields.SingleField;
public class SingleSelection implements Selection {
public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
String field;
switch (fieldExtractor.getSource()) {
case RawHtml:
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
break;
case Html:
if (isRaw)
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
else
field = fieldExtractor.getSelector().select(html);
break;
case Url:
field = fieldExtractor.getSelector().select(page.getUrl().toString());
break;
case RawText:
field = fieldExtractor.getSelector().select(page.getRawText());
break;
default:
field = fieldExtractor.getSelector().select(html);
}
if (field == null && fieldExtractor.isNotNull())
return null;
return new SingleField(field);
}
}

@ -0,0 +1,68 @@
package us.codecraft.webmagic.model.sources;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
/**
 * Strategy that decides which text a {@link FieldExtractor}'s selector is applied to.
 * Each nested implementation reads from a different part of the {@link Page} (or from
 * the {@code html} argument) and offers a single-value and a list variant.
 */
public interface Source {

    /**
     * Selects a single value.
     *
     * @param page           page supplying the document, URL and raw text
     * @param html           HTML text supplied by the caller
     * @param isRaw          caller-supplied flag; only some implementations consult it
     * @param fieldExtractor supplies the selector to apply
     * @return the value produced by the selector
     */
    String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);

    /**
     * Selects a list of values; parameters are as for {@link #getText}.
     *
     * @return the values produced by the selector
     */
    List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);

    /** Always selects against the page's document, ignoring {@code html} and {@code isRaw}. */
    public class RawHtml implements Source {

        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return page.getHtml().selectDocument(fieldExtractor.getSelector());
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
        }
    }

    /** Selects against the page's document when {@code isRaw} is set, otherwise against {@code html}. */
    public class SelectedHtml implements Source {

        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return isRaw
                    ? page.getHtml().selectDocument(fieldExtractor.getSelector())
                    : fieldExtractor.getSelector().select(html);
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return isRaw
                    ? page.getHtml().selectDocumentForList(fieldExtractor.getSelector())
                    : fieldExtractor.getSelector().selectList(html);
        }
    }

    /** Selects against the string form of the page's URL. */
    public class Url implements Source {

        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().select(page.getUrl().toString());
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().selectList(page.getUrl().toString());
        }
    }

    /** Selects against the page's raw text. */
    public class RawText implements Source {

        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().select(page.getRawText());
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().selectList(page.getRawText());
        }
    }

    /** Selects against the {@code html} argument regardless of {@code isRaw}. */
    public class DefaultSource implements Source {

        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().select(html);
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().selectList(html);
        }
    }
}

@ -0,0 +1,17 @@
package us.codecraft.webmagic.model.sources;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.fields.MultipleField;
import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.fields.SingleField;
/**
 * Bridges a {@link FieldExtractor}'s {@link Source} to the {@link PageField} wrapper that
 * matches the extractor's multiplicity.
 */
public class SourceTextExtractor {

    /**
     * Reads the field's text through the extractor's source.
     *
     * @param page           page passed through to the source
     * @param html           HTML text passed through to the source
     * @param isRaw          flag passed through to the source
     * @param fieldExtractor supplies the source and the multi flag
     * @return a {@link MultipleField} holding the source's list result when the extractor is
     *         multi-valued, otherwise a {@link SingleField} holding its single result
     */
    public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
        Source textSource = fieldExtractor.getSource();
        if (!fieldExtractor.isMulti()) {
            return new SingleField(textSource.getText(page, html, isRaw, fieldExtractor));
        }
        return new MultipleField(textSource.getTextList(page, html, isRaw, fieldExtractor));
    }
}
Loading…
Cancel
Save