Changed refactor of processSingle again, this one is a better version (#1157)
* Refactor of processSingle in PageModelExtractor * Changed my refactor of processSingle, this one is a lot better * Changed my refactor of processSingle, this one is a lot better pull/1158/head^2
parent
05e5eefc7d
commit
2df7dca871
@ -1,36 +0,0 @@
|
||||
package us.codecraft.webmagic.model.selections;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.MultipleField;
|
||||
|
||||
public class MultipleSelection implements Selection {
|
||||
public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
List<String> fieldsName;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
break;
|
||||
case Html:
|
||||
if (isRaw)
|
||||
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
else
|
||||
fieldsName = fieldExtractor.getSelector().selectList(html);
|
||||
break;
|
||||
case Url:
|
||||
fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||
break;
|
||||
case RawText:
|
||||
fieldsName = fieldExtractor.getSelector().selectList(page.getRawText());
|
||||
break;
|
||||
default:
|
||||
fieldsName = fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) {
|
||||
return null;
|
||||
}
|
||||
return new MultipleField(fieldsName);
|
||||
}
|
||||
}
|
@ -1,9 +0,0 @@
|
||||
package us.codecraft.webmagic.model.selections;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.PageField;
|
||||
|
||||
public interface Selection {
|
||||
public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
}
|
@ -1,33 +0,0 @@
|
||||
package us.codecraft.webmagic.model.selections;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.SingleField;
|
||||
|
||||
public class SingleSelection implements Selection {
|
||||
public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
String field;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
break;
|
||||
case Html:
|
||||
if (isRaw)
|
||||
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
else
|
||||
field = fieldExtractor.getSelector().select(html);
|
||||
break;
|
||||
case Url:
|
||||
field = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
break;
|
||||
case RawText:
|
||||
field = fieldExtractor.getSelector().select(page.getRawText());
|
||||
break;
|
||||
default:
|
||||
field = fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
if (field == null && fieldExtractor.isNotNull())
|
||||
return null;
|
||||
return new SingleField(field);
|
||||
}
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
package us.codecraft.webmagic.model.sources;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
|
||||
public interface Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
|
||||
public class RawHtml implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
}
|
||||
}
|
||||
|
||||
public class SelectedHtml implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
if (isRaw)
|
||||
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
else
|
||||
return fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
if (isRaw)
|
||||
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
else
|
||||
return fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
}
|
||||
|
||||
public class Url implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||
}
|
||||
}
|
||||
|
||||
public class RawText implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(page.getRawText());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(page.getRawText());
|
||||
}
|
||||
}
|
||||
|
||||
public class DefaultSource implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,17 @@
|
||||
package us.codecraft.webmagic.model.sources;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.MultipleField;
|
||||
import us.codecraft.webmagic.model.fields.PageField;
|
||||
import us.codecraft.webmagic.model.fields.SingleField;
|
||||
|
||||
public class SourceTextExtractor {
|
||||
public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
Source source = fieldExtractor.getSource();
|
||||
if (fieldExtractor.isMulti())
|
||||
return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor));
|
||||
else
|
||||
return new SingleField(source.getText(page, html, isRaw, fieldExtractor));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue