From b18216245b2130ccf9ed984011b799d0cb8ac8ad Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 23 Sep 2013 07:53:33 +0800 Subject: [PATCH] add type convert --- .../webmagic/model/FieldExtractor.java | 11 ++ .../webmagic/model/PageModelExtractor.java | 100 ++++++++++-- .../webmagic/model/annotation/Formatter.java | 41 +++++ .../model/formatter/BasicTypeFormatter.java | 150 ++++++++++++++++++ .../model/formatter/DateFormatter.java | 29 ++++ .../model/formatter/ObjectFormatter.java | 14 ++ .../model/formatter/ObjectFormatters.java | 27 ++++ .../codecraft/webmagic/model/GithubRepo.java | 12 +- 8 files changed, 368 insertions(+), 16 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index 600e184a..a2cba133 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.model; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; @@ -16,6 +17,8 @@ class FieldExtractor extends Extractor { private Method setterMethod; + private ObjectFormatter objectFormatter; + public FieldExtractor(Field field, Selector selector, Source source, boolean 
notNull, boolean multi) { super(selector, source, notNull, multi); this.field = field; @@ -44,4 +47,12 @@ class FieldExtractor extends Extractor { boolean isNotNull() { return notNull; } + + ObjectFormatter getObjectFormatter() { + return objectFormatter; + } + + void setObjectFormatter(ObjectFormatter objectFormatter) { + this.objectFormatter = objectFormatter; + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 54d942c1..370b0fb2 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -1,8 +1,12 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; +import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; +import us.codecraft.webmagic.model.formatter.BasicTypeFormatter; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.formatter.ObjectFormatters; import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.utils.ExtractorUtils; @@ -36,6 +40,8 @@ class PageModelExtractor { private Extractor objectExtractor; + private Logger logger = Logger.getLogger(getClass()); + public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); @@ -62,16 +68,60 @@ class PageModelExtractor { fieldExtractor = fieldExtractorTmp; } if (fieldExtractor != null) { - if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + 
" must be list"); - } + checkFormat(field, fieldExtractor); fieldExtractors.add(fieldExtractor); } } } + private void checkFormat(Field field, FieldExtractor fieldExtractor) { + if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { + Class fieldClazz = BasicTypeFormatter.detectBasicClass(field.getType()); + ObjectFormatter objectFormatter = getObjectFormatter(field, fieldClazz); + if (objectFormatter == null) { + throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz); + } else { + fieldExtractor.setObjectFormatter(objectFormatter); + } + } else if (fieldExtractor.isMulti()) { + if (!List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } + Formatter formatter = field.getAnnotation(Formatter.class); + if (formatter != null) { + if (!formatter.subClazz().equals(Void.class)) { + ObjectFormatter objectFormatter = getObjectFormatter(field, formatter.subClazz()); + if (objectFormatter == null) { + throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + formatter.subClazz()); + } else { + fieldExtractor.setObjectFormatter(objectFormatter); + } + } + } + } + } + + private ObjectFormatter getObjectFormatter(Field field, Class fieldClazz) { + Formatter formatter = field.getAnnotation(Formatter.class); + if (formatter != null) { + if (!formatter.formatter().equals(ObjectFormatter.class)) { + return initFormatter(formatter); + } + } + return ObjectFormatters.get(fieldClazz); + } + + private ObjectFormatter initFormatter(Formatter formatter) { + try { + return formatter.formatter().newInstance(); + } catch (InstantiationException e) { + logger.error("init ObjectFormatter fail", e); + } catch (IllegalAccessException e) { + logger.error("init ObjectFormatter fail", e); + } + return null; + } + private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { 
FieldExtractor fieldExtractor = null; ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); @@ -231,7 +281,12 @@ class PageModelExtractor { if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { return null; } - setField(o, fieldExtractor, value); + if (fieldExtractor.getObjectFormatter() != null) { + List converted = convert(value, fieldExtractor.getObjectFormatter()); + setField(o, fieldExtractor, converted); + } else { + setField(o, fieldExtractor, value); + } } else { String value; switch (fieldExtractor.getSource()) { @@ -254,22 +309,47 @@ class PageModelExtractor { if (value == null && fieldExtractor.isNotNull()) { return null; } - setField(o, fieldExtractor, value); + if (fieldExtractor.getObjectFormatter() != null) { + Object converted = convert(value, fieldExtractor.getObjectFormatter()); + setField(o, fieldExtractor, converted); + } else { + setField(o, fieldExtractor, value); + } } } if (AfterExtractor.class.isAssignableFrom(clazz)) { ((AfterExtractor) o).afterProcess(page); } } catch (InstantiationException e) { - e.printStackTrace(); + logger.error("extract fail", e); } catch (IllegalAccessException e) { - e.printStackTrace(); + logger.error("extract fail", e); } catch (InvocationTargetException e) { - e.printStackTrace(); + logger.error("extract fail", e); } return o; } + private Object convert(String value, ObjectFormatter objectFormatter) { + try { + return objectFormatter.format(value); + } catch (Exception e) { + logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); + } + return null; + } + + private List convert(List values, ObjectFormatter objectFormatter) { + List objects = new ArrayList(); + for (String value : values) { + Object converted = convert(value, objectFormatter); + if (converted != null) { + objects.add(converted); + } + } + return objects; + } + private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, 
InvocationTargetException { if (fieldExtractor.getSetterMethod() != null) { fieldExtractor.getSetterMethod().invoke(o, value); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java new file mode 100644 index 00000000..e603c59f --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.model.annotation; + +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * Define how the result string is convert to an object for field. + * + * @author code4crafter@gmail.com
+ * @since 0.3.2 + */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface Formatter { + + /** + * Set formatter params. + * + * @return formatter params + */ + String[] value(); + + /** + * Specific the class of field of class of elements in collection for field.
+ * It does not need to be set, because the class can be detected from the declared type of the field, + * unless the field is a collection.
+ * + * @return the class of field + */ + Class subClazz() default Void.class; + + /** + * If there are more than one formatter for a class, just specify the implement. + * @return implement + */ + Class formatter() default ObjectFormatter.class; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java new file mode 100644 index 00000000..2669582a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java @@ -0,0 +1,150 @@ +package us.codecraft.webmagic.model.formatter; + +import java.util.Arrays; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.3.2 + */ +public abstract class BasicTypeFormatter implements ObjectFormatter { + + @Override + public void initParam(String[] extra) { + + } + + @Override + public T format(String raw) throws Exception { + if (raw == null) { + return null; + } + raw = raw.trim(); + return formatTrimmed(raw); + } + + protected abstract T formatTrimmed(String raw) throws Exception; + + public static final List basicTypeFormatters = Arrays.asList(new IntegerFormatter(), + new LongFormatter(), new DoubleFormatter(), new FloatFormatter(), new ShortFormatter(), + new CharactorFormatter(), new ByteFormatter(), new BooleanFormatter()); + + public static Class detectBasicClass(Class type) { + if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { + return Integer.class; + } else if (type.equals(Long.TYPE) || type.equals(Long.class)) { + return Long.class; + } else if (type.equals(Double.TYPE) || type.equals(Double.class)) { + return Double.class; + } else if (type.equals(Float.TYPE) || type.equals(Float.class)) { + return Float.class; + } else if (type.equals(Short.TYPE) || type.equals(Short.class)) { + return Short.class; + } else if (type.equals(Character.TYPE) || type.equals(Character.class)) { + return 
Character.class; + } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { + return Byte.class; + } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { + return Boolean.class; + } + return type; + } + + public static class IntegerFormatter extends BasicTypeFormatter { + @Override + public Integer formatTrimmed(String raw) throws Exception { + return Integer.parseInt(raw); + } + + @Override + public Class clazz() { + return Integer.class; + } + } + + public static class LongFormatter extends BasicTypeFormatter { + @Override + public Long formatTrimmed(String raw) throws Exception { + return Long.parseLong(raw); + } + + @Override + public Class clazz() { + return Long.class; + } + } + + public static class DoubleFormatter extends BasicTypeFormatter { + @Override + public Double formatTrimmed(String raw) throws Exception { + return Double.parseDouble(raw); + } + + @Override + public Class clazz() { + return Double.class; + } + } + + public static class FloatFormatter extends BasicTypeFormatter { + @Override + public Float formatTrimmed(String raw) throws Exception { + return Float.parseFloat(raw); + } + + @Override + public Class clazz() { + return Float.class; + } + } + + public static class ShortFormatter extends BasicTypeFormatter { + @Override + public Short formatTrimmed(String raw) throws Exception { + return Short.parseShort(raw); + } + + @Override + public Class clazz() { + return Short.class; + } + } + + public static class CharactorFormatter extends BasicTypeFormatter { + @Override + public Character formatTrimmed(String raw) throws Exception { + return raw.charAt(0); + } + + @Override + public Class clazz() { + return Character.class; + } + } + + public static class ByteFormatter extends BasicTypeFormatter { + @Override + public Byte formatTrimmed(String raw) throws Exception { + return Byte.parseByte(raw, 10); + } + + @Override + public Class clazz() { + return Byte.class; + } + } + + public static class BooleanFormatter extends 
BasicTypeFormatter { + @Override + public Boolean formatTrimmed(String raw) throws Exception { + return Boolean.parseBoolean(raw); + } + + @Override + public Class clazz() { + return Boolean.class; + } + } + + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java new file mode 100644 index 00000000..0ad0302b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.model.formatter; + +import org.apache.commons.lang3.time.DateUtils; + +import java.util.Date; + +/** + * @author code4crafter@gmail.com + * @since 0.3.2 + */ +public class DateFormatter implements ObjectFormatter { + + private String[] datePatterns = new String[]{"YYYY-MM-dd HH:mm"}; + + @Override + public Date format(String raw) throws Exception { + return DateUtils.parseDate(raw, datePatterns); + } + + @Override + public Class clazz() { + return Date.class; + } + + @Override + public void initParam(String[] extra) { + datePatterns = extra; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java new file mode 100644 index 00000000..aea7272b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic.model.formatter; + +/** + * @author code4crafter@gmail.com + */ +public interface ObjectFormatter { + + T format(String raw) throws Exception; + + Class clazz(); + + void initParam(String[] extra); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java new file mode 100644 
index 00000000..6dedc3ce --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.model.formatter; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * @author code4crafter@gmail.com + * @since 0.3.2 + */ +public class ObjectFormatters { + + private static Map formatterMap = new ConcurrentHashMap(); + + static { + for (ObjectFormatter basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) { + put(basicTypeFormatter); + } + } + + public static void put(ObjectFormatter objectFormatter) { + formatterMap.put(objectFormatter.clazz(), objectFormatter); + } + + public static ObjectFormatter get(Class clazz){ + return formatterMap.get(clazz); + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java index 5b6319a0..a9e049b1 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java @@ -32,10 +32,10 @@ public class GithubRepo implements HasKey { private List language; @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()") - private String star; + private int star; @ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()") - private String fork; + private int fork; @ExtractByUrl private String url; @@ -46,8 +46,8 @@ public class GithubRepo implements HasKey { , new PageModelPipeline() { @Override public void process(GithubRepo o, Task task) { - Assert.assertEquals("78",o.getStar().trim()); - Assert.assertEquals("65",o.getFork().trim()); + Assert.assertEquals(78, o.getStar()); + Assert.assertEquals(65, o.getFork()); } }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); } @@ -77,11 +77,11 
@@ public class GithubRepo implements HasKey { return url; } - public String getStar() { + public int getStar() { return star; } - public String getFork() { + public int getFork() { return fork; } }