#42 Add jsonpath in annotation mode for json result

pull/44/head
yihua.huang 11 years ago
parent c2d6d495b3
commit 59ad4cad27

@ -9,7 +9,7 @@ import java.util.ArrayList;
import java.util.List;
/**
* Selectable plain text.<br>
* Selectable html.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
@ -23,16 +23,28 @@ public class Html extends PlainText {
*/
private Document document;
private boolean init = false;
/**
 * Creates an instance from a list of strings by passing them to the parent
 * constructor. No document is parsed in this constructor.
 *
 * @param strings the strings handed to the parent constructor
 */
public Html(List<String> strings) {
super(strings);
}
public Html(String text) {
super(text);
try {
this.document = Jsoup.parse(text);
} catch (Exception e) {
logger.warn("parse document error ", e);
}
/**
 * Lazily parses the text returned by {@code getText()} into the
 * {@code document} field.
 * <p>
 * The {@code init} flag is raised before parsing, so parsing is attempted at
 * most once: a failure is logged as a warning and never retried.
 */
private void initDocument() {
    // Nothing to do when a document already exists or an attempt was already made.
    if (init || this.document != null) {
        return;
    }
    // Mark the attempt up front: init just once whether the parsing succeeds or not.
    init = true;
    try {
        this.document = Jsoup.parse(getText());
    } catch (Exception e) {
        logger.warn("parse document error ", e);
    }
}
@ -47,6 +59,7 @@ public class Html extends PlainText {
@Override
protected Selectable select(Selector selector, List<String> strings) {
initDocument();
List<String> results = new ArrayList<String>();
for (String string : strings) {
String result = selector.select(string);
@ -59,6 +72,7 @@ public class Html extends PlainText {
@Override
protected Selectable selectList(Selector selector, List<String> strings) {
initDocument();
List<String> results = new ArrayList<String>();
for (String string : strings) {
List<String> result = selector.selectList(string);
@ -69,6 +83,7 @@ public class Html extends PlainText {
/**
 * Applies the selector returned by {@code Selectors.smartContent()} to this
 * object's strings.
 *
 * @return the result of {@code select} with the smart-content selector
 */
@Override
public Selectable smartContent() {
    // Trigger the lazy document initialisation before selecting.
    initDocument();
    return select(Selectors.smartContent(), strings);
}

@ -0,0 +1,24 @@
package us.codecraft.webmagic.example;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
/**
 * Example page model for annotation mode with JsonPath: the two fields are
 * filled from the response of an iTunes lookup URL using the JsonPath
 * expressions {@code $..trackName} and {@code $..description}.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class AppStore {

    /** Value matched by the JsonPath expression {@code $..trackName}. */
    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName")
    private String trackName;

    /** Value matched by the JsonPath expression {@code $..description}. */
    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description")
    private String description;

    /**
     * Fetches the lookup URL once, maps the result onto an {@code AppStore}
     * instance and prints both extracted fields.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        AppStore appStore = OOSpider.create(Site.me(), AppStore.class).<AppStore>get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software");
        // The spider may hand back no model (e.g. download or extraction failure);
        // report that instead of failing with a NullPointerException below.
        if (appStore == null) {
            System.err.println("No AppStore result could be extracted from the response");
            return;
        }
        System.out.println(appStore.trackName);
        System.out.println(appStore.description);
    }
}

@ -239,7 +239,7 @@ class PageModelExtractor {
} else {
if (objectExtractor.multi) {
List<Object> os = new ArrayList<Object>();
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
List<String> list = objectExtractor.getSelector().selectList(page.getRawText());
for (String s : list) {
Object o = processSingle(page, s, false);
if (o != null) {
@ -248,7 +248,7 @@ class PageModelExtractor {
}
return os;
} else {
String select = objectExtractor.getSelector().select(page.getHtml().toString());
String select = objectExtractor.getSelector().select(page.getRawText());
Object o = processSingle(page, select, false);
return o;
}

@ -24,7 +24,7 @@ public @interface ExtractBy {
/**
* types of extractor expressions
*/
public static enum Type {XPath, Regex, Css}
public static enum Type {XPath, Regex, Css, JsonPath}
/**
* Extractor type, supports XPath, CSS Selector, regex and JsonPath.

@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Define a extractor for url. Only regex can be used. <br>
* Define an extractor to extract data from the url of the current page. Only regex can be used. <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.0

@ -27,6 +27,9 @@ public class ExtractorUtils {
case XPath:
selector = getXpathSelector(value);
break;
case JsonPath:
selector = new JsonPathSelector(value);
break;
default:
selector = getXpathSelector(value);
}

Loading…
Cancel
Save