From 2df7dca8711d226dd98bd0afefa4531a6d1e44b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Gibier?= Date: Fri, 5 Apr 2024 16:50:21 +0200 Subject: [PATCH] Changed refactor of processSingle again, this one is a better version (#1157) * Refactor of processSingle in PageModelExtractor * Changed my refactor of processSingle, this one is a lot better * Changed my refactor of processSingle, this one is a lot better --- .../codecraft/webmagic/model/Extractor.java | 6 +- .../webmagic/model/FieldExtractor.java | 1 + .../webmagic/model/PageModelExtractor.java | 36 +++++----- .../model/selections/MultipleSelection.java | 36 ---------- .../webmagic/model/selections/Selection.java | 9 --- .../model/selections/SingleSelection.java | 33 --------- .../webmagic/model/sources/Source.java | 68 +++++++++++++++++++ .../model/sources/SourceTextExtractor.java | 17 +++++ 8 files changed, 105 insertions(+), 101 deletions(-) delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index d64adffd..67344758 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -2,6 +2,8 @@ package us.codecraft.webmagic.model; import lombok.Getter; import lombok.Setter; + +import us.codecraft.webmagic.model.sources.Source; import 
us.codecraft.webmagic.selector.Selector; /** @@ -20,9 +22,7 @@ public class Extractor { protected final boolean notNull; protected final boolean multi; - - public static enum Source {Html, Url, RawHtml, RawText} - + public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; this.source = source; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index a49ea776..d4cb5937 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.sources.Source; import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index de71717f..751aafe7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -9,9 +9,9 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; -import us.codecraft.webmagic.model.selections.MultipleSelection; -import us.codecraft.webmagic.model.selections.Selection; -import us.codecraft.webmagic.model.selections.SingleSelection; +import us.codecraft.webmagic.model.sources.Source; +import us.codecraft.webmagic.model.sources.SourceTextExtractor; +import us.codecraft.webmagic.model.sources.Source.*; import us.codecraft.webmagic.selector.*; 
import us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; @@ -95,7 +95,7 @@ class PageModelExtractor { regexPattern = ".*"; } fieldExtractor = new FieldExtractor(field, - new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), + new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(), extractByUrl.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -121,7 +121,7 @@ class PageModelExtractor { default: selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); } - fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, + fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(), comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -136,26 +136,23 @@ class PageModelExtractor { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); - ExtractBy.Source source0 = extractBy.source(); - if (extractBy.type()== ExtractBy.Type.JsonPath){ - source0 = RawText; - } - FieldExtractor.Source source = null; - switch (source0){ + ExtractBy.Source extractSource = extractBy.source(); + if (extractBy.type()== ExtractBy.Type.JsonPath) + extractSource = RawText; + Source source = null; + switch (extractSource) { case RawText: - source = FieldExtractor.Source.RawText; + source = new RawText(); break; case RawHtml: - source = FieldExtractor.Source.RawHtml; + source = new RawHtml(); break; case SelectedHtml: - source =FieldExtractor.Source.Html; + source = new SelectedHtml(); break; default: - source 
=FieldExtractor.Source.Html; - + source = new SelectedHtml(); } - fieldExtractor = new FieldExtractor(field, selector, source, extractBy.notNull(), List.class.isAssignableFrom(field.getType())); fieldExtractor.setSetterMethod(getSetterMethod(clazz, field)); @@ -202,7 +199,7 @@ class PageModelExtractor { annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi()); } } @@ -242,8 +239,7 @@ class PageModelExtractor { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection(); - PageField field = selection.extractField(page, html, isRaw, fieldExtractor); + PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor); if (!field.operation(o, fieldExtractor, logger)) return null; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java deleted file mode 100644 index d49f9c57..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java +++ /dev/null @@ -1,36 +0,0 @@ -package us.codecraft.webmagic.model.selections; - -import java.util.List; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.FieldExtractor; -import us.codecraft.webmagic.model.fields.MultipleField; - -public class MultipleSelection implements Selection { - public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { - List fieldsName; - switch (fieldExtractor.getSource()) { - 
case RawHtml: - fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) - fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - else - fieldsName = fieldExtractor.getSelector().selectList(html); - break; - case Url: - fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - fieldsName = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - fieldsName = fieldExtractor.getSelector().selectList(html); - } - if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) { - return null; - } - return new MultipleField(fieldsName); - } -} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java deleted file mode 100644 index e70ab9d9..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java +++ /dev/null @@ -1,9 +0,0 @@ -package us.codecraft.webmagic.model.selections; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.FieldExtractor; -import us.codecraft.webmagic.model.fields.PageField; - -public interface Selection { - public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java deleted file mode 100644 index a4c1fe45..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java +++ /dev/null @@ -1,33 +0,0 @@ -package us.codecraft.webmagic.model.selections; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.FieldExtractor; -import 
us.codecraft.webmagic.model.fields.SingleField;
-
-public class SingleSelection implements Selection {
-    public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
-        String field;
-        switch (fieldExtractor.getSource()) {
-            case RawHtml:
-                field = page.getHtml().selectDocument(fieldExtractor.getSelector());
-                break;
-            case Html:
-                if (isRaw)
-                    field = page.getHtml().selectDocument(fieldExtractor.getSelector());
-                else
-                    field = fieldExtractor.getSelector().select(html);
-                break;
-            case Url:
-                field = fieldExtractor.getSelector().select(page.getUrl().toString());
-                break;
-            case RawText:
-                field = fieldExtractor.getSelector().select(page.getRawText());
-                break;
-            default:
-                field = fieldExtractor.getSelector().select(html);
-        }
-        if (field == null && fieldExtractor.isNotNull())
-            return null;
-        return new SingleField(field);
-    }
-}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java
new file mode 100644
index 00000000..14682722
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java
@@ -0,0 +1,103 @@
+package us.codecraft.webmagic.model.sources;
+
+import java.util.List;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.model.FieldExtractor;
+
+/**
+ * Chooses the input that a {@link FieldExtractor}'s selector is applied to.
+ * The nested classes are the available choices; none of them holds state.
+ */
+public interface Source {
+    /**
+     * Applies the extractor's selector to this source and returns one result.
+     *
+     * @param page page whose HTML document, URL or raw text may be read
+     * @param html text handed to the selector by {@link SelectedHtml} (when
+     *        {@code isRaw} is false) and by {@link DefaultSource}
+     * @param isRaw when true, {@link SelectedHtml} selects on {@code page.getHtml()}
+     *        instead of {@code html}; the other sources ignore it
+     * @param fieldExtractor provides the selector to apply
+     * @return the selection result, passed through unchanged
+     */
+    public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
+
+    /**
+     * List variant of {@link #getText}: same inputs, list-returning selection call.
+     *
+     * @return the selection result list, passed through unchanged
+     */
+    public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
+
+    /** Always selects on {@code page.getHtml()}; {@code html} and {@code isRaw} are unused. */
+    public class RawHtml implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return page.getHtml().selectDocument(fieldExtractor.getSelector());
+        }
+
+        @Override
+        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
+        }
+    }
+
+    /** Selects on {@code page.getHtml()} when {@code isRaw} is true, otherwise on {@code html}. */
+    public class SelectedHtml implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            if (isRaw) {
+                return page.getHtml().selectDocument(fieldExtractor.getSelector());
+            }
+            return fieldExtractor.getSelector().select(html);
+        }
+
+        @Override
+        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            if (isRaw) {
+                return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
+            }
+            return fieldExtractor.getSelector().selectList(html);
+        }
+    }
+
+    /** Selects on the string form of {@code page.getUrl()}. */
+    public class Url implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().select(page.getUrl().toString());
+        }
+
+        @Override
+        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().selectList(page.getUrl().toString());
+        }
+    }
+
+    /** Selects on {@code page.getRawText()}. */
+    public class RawText implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().select(page.getRawText());
+        }
+
+        @Override
+        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().selectList(page.getRawText());
+        }
+    }
+
+    /** Selects on {@code html} regardless of {@code isRaw}. */
+    public class DefaultSource implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().select(html);
+        }
+
+        @Override
+        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().selectList(html);
+        }
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java
new file
mode 100644
index 00000000..1e572695
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java
@@ -0,0 +1,31 @@
+package us.codecraft.webmagic.model.sources;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.model.FieldExtractor;
+import us.codecraft.webmagic.model.fields.MultipleField;
+import us.codecraft.webmagic.model.fields.PageField;
+import us.codecraft.webmagic.model.fields.SingleField;
+
+/**
+ * Runs a {@link FieldExtractor}'s {@link Source} and wraps the outcome in a {@link PageField}.
+ */
+public class SourceTextExtractor {
+    /**
+     * Extracts the value for one field through the extractor's own source.
+     *
+     * @param page forwarded to the source unchanged
+     * @param html forwarded to the source unchanged
+     * @param isRaw forwarded to the source unchanged
+     * @param fieldExtractor supplies the source and the single/multi flag
+     * @return a {@link SingleField} around {@code getText(...)} unless
+     *         {@code fieldExtractor.isMulti()}, in which case a {@link MultipleField}
+     *         around {@code getTextList(...)}
+     */
+    public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+        Source origin = fieldExtractor.getSource();
+        if (!fieldExtractor.isMulti()) {
+            return new SingleField(origin.getText(page, html, isRaw, fieldExtractor));
+        }
+        return new MultipleField(origin.getTextList(page, html, isRaw, fieldExtractor));
+    }
+}
\ No newline at end of file