Refactor and implement a template method pattern for logger config in webmagic-scripts (#1158)

* Refactor of processSingle in PageModelExtractor

* Changed my refactor of processSingle, this one is a lot better

* Changed my refactor of processSingle, this one is a lot better

* add lombok for getters and setters

* Refactor and implement a template method pattern for logger config
pull/1169/head
François Gibier 10 months ago committed by GitHub
parent 2df7dca871
commit d8321baf56
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -53,6 +53,12 @@
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.32</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>

@ -0,0 +1,47 @@
package us.codecraft.webmagic.scripts;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.Getter;
import lombok.Setter;
import us.codecraft.webmagic.scripts.languages.JRuby;
import us.codecraft.webmagic.scripts.languages.Javascript;
import us.codecraft.webmagic.scripts.languages.Language;
import us.codecraft.webmagic.utils.WMCollections;
/**
 * Holds the parameters of a script run: the scripting language, the script
 * file name, the URLs, the thread count and the sleep time.
 */
public class Params {

    /**
     * Argument spellings accepted for each supported language.
     * <p>
     * Populated exactly once when the class is loaded. Rebuilding this static
     * map from the instance constructor would replace shared state on every
     * {@code new Params()} and leave it {@code null} until the first instance
     * is created.
     */
    private static final Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();

    static {
        alias.put(new Javascript(), WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
        alias.put(new JRuby(), WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
    }

    /** Selected script language; a {@link Javascript} instance unless changed. */
    @Getter
    Language language = new Javascript();

    /** Name of the script file. */
    @Getter @Setter
    String scriptFileName;

    /** URLs supplied for the run. */
    @Getter @Setter
    List<String> urls;

    /** Thread count; defaults to 1. */
    @Getter @Setter
    int thread = 1;

    /** Sleep time; defaults to 1000. */
    @Getter @Setter
    int sleepTime = 1000;

    /**
     * Creates a parameter holder populated with the default values.
     */
    public Params() {
        // Intentionally empty: the alias table is initialised statically.
    }

    /**
     * Selects the language whose alias set contains the given argument.
     * The match is exact (case-sensitive); if no alias set contains the
     * argument, the current language is left unchanged.
     *
     * @param arg the language name as supplied by the caller
     */
    public void setLanguagefromArg(String arg) {
        for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
            if (languageSetEntry.getValue().contains(arg)) {
                this.language = languageSetEntry.getKey();
                return;
            }
        }
    }
}

@ -1,90 +1,21 @@
package us.codecraft.webmagic.scripts;
import org.apache.commons.cli.*;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.scripts.config.CommandLineOption;
import us.codecraft.webmagic.utils.WMCollections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* @author code4crafter@gmail.com
* @author code4crafter@gmail.com / FrancoisGib
* @since 0.4.1
*/
public class ScriptConsole {
private static class Params {
Language language = Language.JavaScript;
String scriptFileName;
List<String> urls;
int thread = 1;
int sleepTime = 1000;
private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
static {
alias.put(Language.JavaScript, WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
alias.put(Language.JRuby, WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
}
public void setLanguagefromArg(String arg) {
for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
if (languageSetEntry.getValue().contains(arg)) {
this.language = languageSetEntry.getKey();
return;
}
}
}
private Language getLanguage() {
return language;
}
private void setLanguage(Language language) {
this.language = language;
}
private String getScriptFileName() {
return scriptFileName;
}
private void setScriptFileName(String scriptFileName) {
this.scriptFileName = scriptFileName;
}
private List<String> getUrls() {
return urls;
}
private void setUrls(List<String> urls) {
this.urls = urls;
}
private int getThread() {
return thread;
}
private void setThread(int thread) {
this.thread = thread;
}
private int getSleepTime() {
return sleepTime;
}
private void setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
}
}
public static void main(String[] args) {
Params params = parseCommand(args);
startSpider(params);
@ -142,45 +73,9 @@ public class ScriptConsole {
private static Params readOptions(CommandLine commandLine) {
Params params = new Params();
if (commandLine.hasOption("l")) {
String language = commandLine.getOptionValue("l");
params.setLanguagefromArg(language);
}
if (commandLine.hasOption("f")) {
String scriptFilename = commandLine.getOptionValue("f");
params.setScriptFileName(scriptFilename);
} else {
exit();
}
if (commandLine.hasOption("s")) {
Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
params.setSleepTime(sleepTime);
}
if (commandLine.hasOption("t")) {
Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
params.setThread(thread);
}
if (commandLine.hasOption("g")) {
configLogger(commandLine.getOptionValue("g"));
}
params.setUrls(commandLine.getArgList());
List<CommandLineOption> options = CommandLineOption.getAllOptions();
for (CommandLineOption option : options)
option.addParamOptionIfInCommandLine(params, commandLine);
return params;
}
private static void configLogger(String value) {
Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
if ("debug".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.DEBUG);
} else if ("info".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.INFO);
} else if ("warn".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.WARN);
} else if ("trace".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.TRACE);
} else if ("off".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.OFF);
} else if ("error".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.ERROR);
}
}
}

@ -2,6 +2,9 @@ package us.codecraft.webmagic.scripts;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import us.codecraft.webmagic.scripts.languages.Language;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
@ -11,14 +14,11 @@ import java.util.concurrent.atomic.AtomicInteger;
*/
public class ScriptEnginePool {
private final int size;
private final AtomicInteger availableCount;
private final LinkedBlockingQueue<ScriptEngine> scriptEngines = new LinkedBlockingQueue<ScriptEngine>();
public ScriptEnginePool(Language language,int size) {
this.size = size;
this.availableCount = new AtomicInteger(size);
for (int i=0;i<size;i++){
ScriptEngineManager manager = new ScriptEngineManager();

@ -4,17 +4,14 @@ package us.codecraft.webmagic.scripts;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import org.apache.commons.io.IOUtils;
import org.jruby.RubyHash;
import org.python.core.PyDictionary;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scripts.languages.Language;
/**
* @author code4crafter@gmail.com
@ -55,35 +52,7 @@ public class ScriptProcessor implements PageProcessor {
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
try {
switch (language) {
case JavaScript:
engine.eval(defines + "\n" + script, context);
// NativeObject o = (NativeObject) engine.get("result");
// if (o != null) {
// for (Object o1 : o.getIds()) {
// String key = String.valueOf(o1);
// page.getResultItems().put(key, NativeObject.getProperty(o, key));
// }
// }
break;
case JRuby:
RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, context);
Iterator itruby = oRuby.entrySet().iterator();
while (itruby.hasNext()) {
Map.Entry pairs = (Map.Entry) itruby.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
break;
case Jython:
engine.eval(defines + "\n" + script, context);
PyDictionary oJython = (PyDictionary) engine.get("result");
Iterator it = oJython.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pairs = (Map.Entry) it.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
break;
}
this.language.process(engine, defines, script, page);
} catch (ScriptException e) {
e.printStackTrace();
}

@ -7,6 +7,9 @@ import java.io.InputStream;
import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.scripts.languages.Javascript;
import us.codecraft.webmagic.scripts.languages.Language;
/**
* @author code4crafter@gmail.com
@ -14,7 +17,7 @@ import org.apache.commons.io.IOUtils;
*/
public class ScriptProcessorBuilder {
private static final Language DefaultLanguage = Language.JavaScript;
private static final Language DefaultLanguage = new Javascript();
private Language language = DefaultLanguage;
@ -39,7 +42,6 @@ public class ScriptProcessorBuilder {
InputStream resourceAsStream = new FileInputStream(fileName);
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
} catch (IOException e) {
//wrap IOException because I prefer a runtime exception...
throw new IllegalArgumentException(e);
}
return this;
@ -50,7 +52,6 @@ public class ScriptProcessorBuilder {
InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
} catch (IOException e) {
//wrap IOException because I prefer a runtime exception...
throw new IllegalArgumentException(e);
}
return this;

@ -0,0 +1,82 @@
package us.codecraft.webmagic.scripts.config;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import lombok.Getter;
import us.codecraft.webmagic.scripts.Params;
/**
 * Base class for a single command-line option handler.
 * <p>
 * {@link #addParamOptionIfInCommandLine(Params, CommandLine)} is the template
 * method: it checks whether the option is present and, if so, delegates to the
 * subclass hook {@link #addParamOption(Params, CommandLine)}.
 */
public abstract class CommandLineOption {

    /** Short option character this handler reacts to. */
    @Getter
    char option;

    /**
     * @param option the short option character handled by this instance
     */
    public CommandLineOption(char option) {
        this.option = option;
    }

    /**
     * Applies this option to the given parameters. Called only when the
     * option is present in the command line.
     *
     * @param params      the parameters to update
     * @param commandLine the parsed command line
     */
    protected abstract void addParamOption(Params params, CommandLine commandLine);

    /**
     * Invokes {@link #addParamOption(Params, CommandLine)} when, and only when,
     * the command line contains this option.
     *
     * @param params      the parameters to update
     * @param commandLine the parsed command line
     */
    public void addParamOptionIfInCommandLine(Params params, CommandLine commandLine) {
        if (!commandLine.hasOption(this.option)) {
            return;
        }
        this.addParamOption(params, commandLine);
    }

    /**
     * @return one handler per supported option, in the order l, f, s, t, g
     */
    public static List<CommandLineOption> getAllOptions() {
        return List.of(
                new OptionL(),
                new OptionF(),
                new OptionS(),
                new OptionT(),
                new OptionG());
    }
}
class OptionL extends CommandLineOption {
public OptionL() {
super('l');
}
protected void addParamOption(Params params, CommandLine commandLine) {
String language = commandLine.getOptionValue("l");
params.setLanguagefromArg(language);
}
}
class OptionF extends CommandLineOption {
public OptionF() {
super('f');
}
protected void addParamOption(Params params, CommandLine commandLine) {
String scriptFilename = commandLine.getOptionValue("f");
params.setScriptFileName(scriptFilename);
}
}
class OptionS extends CommandLineOption {
public OptionS() {
super('s');
}
protected void addParamOption(Params params, CommandLine commandLine) {
Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
params.setSleepTime(sleepTime);
}
}
class OptionT extends CommandLineOption {
public OptionT() {
super('t');
}
protected void addParamOption(Params params, CommandLine commandLine) {
Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
params.setThread(thread);
}
}
class OptionG extends CommandLineOption {
public OptionG() {
super('g');
}
protected void addParamOption(Params params, CommandLine commandLine) {
ConfigLogger.configLogger(commandLine.getOptionValue("g"));
}
}

@ -0,0 +1,34 @@
package us.codecraft.webmagic.scripts.config;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.Logger;
import org.slf4j.LoggerFactory;
/**
 * Applies a log level, given by name, to the root logger.
 */
public class ConfigLogger {

    /**
     * Sets the root logger level from its textual name.
     * <p>
     * Recognised names, compared case-insensitively, are {@code debug},
     * {@code info}, {@code warn}, {@code trace}, {@code off} and {@code error}.
     * Any other value, including {@code null}, leaves the level unchanged.
     *
     * @param value The config string
     */
    public static void configLogger(String value) {
        List<Pair<String, Level>> options = List.of(
                Pair.of("debug", Level.DEBUG),
                Pair.of("info", Level.INFO),
                Pair.of("warn", Level.WARN),
                Pair.of("trace", Level.TRACE),
                Pair.of("off", Level.OFF),
                Pair.of("error", Level.ERROR));
        // A plain scan avoids a separate index counter: with a counter that is
        // advanced past the matched entry, a match on the last entry ("error")
        // is indistinguishable from "no match" and would be silently ignored.
        for (Pair<String, Level> option : options) {
            if (option.getLeft().equalsIgnoreCase(value)) {
                Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
                rootLogger.setLevel(option.getRight());
                return;
            }
        }
    }
}

@ -0,0 +1,26 @@
package us.codecraft.webmagic.scripts.languages;
import java.util.Iterator;
import java.util.Map;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import org.jruby.RubyHash;
import us.codecraft.webmagic.Page;
public class JRuby extends Language {
public JRuby() {
super("jruby","ruby/defines.rb","");
}
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext());
Iterator itruby = oRuby.entrySet().iterator();
while (itruby.hasNext()) {
Map.Entry pairs = (Map.Entry) itruby.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
}
}

@ -0,0 +1,16 @@
package us.codecraft.webmagic.scripts.languages;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import us.codecraft.webmagic.Page;
public class Javascript extends Language {
public Javascript() {
super("javascript","js/defines.js","");
}
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
engine.eval(defines + "\n" + script, engine.getContext());
}
}

@ -0,0 +1,27 @@
package us.codecraft.webmagic.scripts.languages;
import java.util.Iterator;
import java.util.Map;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import org.python.core.PyDictionary;
import us.codecraft.webmagic.Page;
public class Jython extends Language {
public Jython() {
super("jython","python/defines.py","");
}
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
engine.eval(defines + "\n" + script, engine.getContext());
PyDictionary oJython = (PyDictionary) engine.get("result");
Iterator it = oJython.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pairs = (Map.Entry) it.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
}
}

@ -1,15 +1,18 @@
package us.codecraft.webmagic.scripts;
package us.codecraft.webmagic.scripts.languages;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import us.codecraft.webmagic.Page;
/**
* @author code4crafter@gmail.com
* @author FrancoisGib
*/
public enum Language {
JavaScript("javascript","js/defines.js",""),
JRuby("jruby","ruby/defines.rb",""),
Jython("jython","python/defines.py","");
public abstract class Language {
public Language(String engineName, String defineFile, String gatherFile) {
this.engineName = engineName;
this.defineFile = defineFile;
this.gatherFile = gatherFile;
}
private String engineName;
@ -17,12 +20,6 @@ public enum Language {
private String gatherFile;
Language(String engineName, String defineFile, String gatherFile) {
this.engineName = engineName;
this.defineFile = defineFile;
this.gatherFile = gatherFile;
}
public String getEngineName() {
return engineName;
}
@ -34,4 +31,6 @@ public enum Language {
public String getGatherFile() {
return gatherFile;
}
public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException;
}

@ -2,7 +2,11 @@ package us.codecraft.webmagic.scripts;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scripts.languages.JRuby;
import us.codecraft.webmagic.scripts.languages.Javascript;
import us.codecraft.webmagic.scripts.languages.Jython;
/**
* @author code4crafter@gmail.com
@ -13,14 +17,14 @@ public class ScriptProcessorTest {
@Test
public void testJavaScriptProcessor() {
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build();
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build();
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}
@Test
public void testRubyProcessor() {
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build();
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build();
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}
@ -28,7 +32,7 @@ public class ScriptProcessorTest {
@Test
public void testPythonProcessor() {
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build();
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build();
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}

Loading…
Cancel
Save