add more option to extractors

pull/17/head
yihua.huang 12 years ago
parent c70ed57025
commit 20705b34ac

@ -19,10 +19,17 @@ public class CssSelector implements Selector {
private String selectorText; private String selectorText;
private String attrName;
public CssSelector(String selectorText) { public CssSelector(String selectorText) {
this.selectorText = selectorText; this.selectorText = selectorText;
} }
/**
 * Creates a selector that, for each element matched by the CSS expression,
 * yields the value of the named attribute instead of the element's outer HTML.
 *
 * @param selectorText css selector expression
 * @param attrName     name of the attribute to read from matched elements
 */
public CssSelector(String selectorText, String attrName) {
    // Reuse the single-argument constructor for the selector expression.
    this(selectorText);
    this.attrName = attrName;
}
@Override @Override
public String select(String text) { public String select(String text) {
Document doc = Jsoup.parse(text); Document doc = Jsoup.parse(text);
@ -30,7 +37,15 @@ public class CssSelector implements Selector {
if (CollectionUtils.isEmpty(elements)) { if (CollectionUtils.isEmpty(elements)) {
return null; return null;
} }
return elements.get(0).outerHtml(); return getValue(elements.get(0));
}
/**
 * Produces the string this selector yields for a single matched element.
 *
 * When no attribute name was configured (attrName is null) the element's
 * outer HTML is returned; otherwise the value of the attribute named
 * attrName is returned.
 *
 * NOTE(review): Jsoup documents attr(String) as returning an empty string,
 * not null, when the attribute is absent. If that holds for the Jsoup
 * version in use, callers that test this result against null will still
 * receive "" for elements lacking the attribute - confirm whether that is
 * intended.
 *
 * @param element matched element to extract from
 * @return outer HTML of the element, or the value of the configured attribute
 */
private String getValue(Element element) {
if (attrName == null) {
return element.outerHtml();
} else {
return element.attr(attrName);
}
} }
@Override @Override
@ -40,7 +55,10 @@ public class CssSelector implements Selector {
Elements elements = doc.select(selectorText); Elements elements = doc.select(selectorText);
if (CollectionUtils.isNotEmpty(elements)) { if (CollectionUtils.isNotEmpty(elements)) {
for (Element element : elements) { for (Element element : elements) {
strings.add(element.outerHtml()); String value = getValue(element);
if (value != null) {
strings.add(value);
}
} }
} }
return strings; return strings;

@ -69,4 +69,10 @@ public class Html extends PlainText {
return selectList(cssSelector, strings); return selectList(cssSelector, strings);
} }
/**
 * Selects with a css selector and reads the named attribute of each match.
 *
 * Wraps the expression and attribute name in a CssSelector and applies it
 * to this object's strings via selectList.
 *
 * @param selector css selector expression
 * @param attrName name of the attribute to read from matched elements
 * @return new Selectable produced by selectList
 */
@Override
public Selectable $(String selector, String attrName) {
    return selectList(new CssSelector(selector, attrName), strings);
}
} }

@ -40,6 +40,11 @@ public class PlainText implements Selectable {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
/**
 * Css selection with an attribute name is not supported by this class;
 * the call always fails.
 *
 * @param selector css selector expression (unused)
 * @param attrName attribute name (unused)
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Selectable $(String selector, String attrName) {
throw new UnsupportedOperationException();
}
@Override @Override
public Selectable smartContent() { public Selectable smartContent() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
@ -56,6 +61,12 @@ public class PlainText implements Selectable {
return selectList(regexSelector, strings); return selectList(regexSelector, strings);
} }
/**
 * Selects with a regex and an explicitly chosen group.
 *
 * Obtains a RegexSelector for the given regex and group from the
 * SelectorFactory and applies it to this object's strings via selectList.
 *
 * @param regex regular expression to apply
 * @param group group passed on to the RegexSelector
 * @return new Selectable produced by selectList
 */
@Override
public Selectable regex(String regex, int group) {
    SelectorFactory factory = SelectorFactory.getInstatnce();
    return selectList(factory.newRegexSelector(regex, group), strings);
}
protected Selectable select(Selector selector, List<String> strings) { protected Selectable select(Selector selector, List<String> strings) {
List<String> results = new ArrayList<String>(); List<String> results = new ArrayList<String>();
for (String string : strings) { for (String string : strings) {

@ -26,6 +26,15 @@ public interface Selectable {
*/ */
public Selectable $(String selector); public Selectable $(String selector);
/**
 * Select a list with a css selector, taking the named attribute of each
 * matched element.
 *
 * @param selector css selector expression
 * @param attrName name of the attribute to read from the elements matched by the selector
 * @return new Selectable holding the extracted results
 */
public Selectable $(String selector, String attrName);
/** /**
* select smart content with ReadAbility algorithm * select smart content with ReadAbility algorithm
* *
@ -41,13 +50,22 @@ public interface Selectable {
public Selectable links(); public Selectable links();
/** /**
* select list with regex * select list with regex, default group is group 1
* *
* @param regex * @param regex
* @return new Selectable after extract * @return new Selectable after extract
*/ */
public Selectable regex(String regex); public Selectable regex(String regex);
/**
 * Select a list with a regex, using an explicitly chosen group.
 *
 * @param regex regular expression to apply
 * @param group regex group to select (the single-argument overload is
 *              documented as defaulting to group 1);
 *              NOTE(review): presumably a capture-group index - confirm against RegexSelector
 * @return new Selectable holding the extracted results
 */
public Selectable regex(String regex, int group);
/** /**
* replace with regex * replace with regex
* *

@ -26,6 +26,10 @@ public class SelectorFactory {
return newSelector(RegexSelector.class, regex); return newSelector(RegexSelector.class, regex);
} }
/**
 * Obtains a RegexSelector for the given regex and group.
 *
 * The group number is handed to newSelector in its decimal string form,
 * alongside the regex.
 *
 * @param regex regular expression for the selector
 * @param group group number for the selector
 * @return the RegexSelector returned by newSelector
 */
public RegexSelector newRegexSelector(String regex, int group) {
    String groupParam = Integer.toString(group);
    return newSelector(RegexSelector.class, regex, groupParam);
}
public ReplaceSelector newReplaceSelector(String regex, String replacement) { public ReplaceSelector newReplaceSelector(String regex, String replacement) {
return newSelector(ReplaceSelector.class, regex, replacement); return newSelector(ReplaceSelector.class, regex, replacement);
} }

Loading…
Cancel
Save