/*
 * Provenance: this class was introduced by commit
 * 81bb809dba18c7dfb7b24a65285fa1a1bec709c5 ("[PATCH] update scripts",
 * "yihua.huang", Tue, 12 Nov 2013 10:38:12 +0800) as
 * webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java.
 *
 * The same commit touched two further files whose XML markup did not survive
 * extraction. Only the following values are still legible; the element names
 * they belong to are presumed, not visible:
 *
 *   webmagic-scripts/pom.xml (57 added lines)
 *     - new dependency: commons-cli / commons-cli / 1.2
 *     - org.apache.maven.plugins / maven-dependency-plugin:
 *       copy-dependencies, package, copy-dependencies,
 *       ${project.build.directory}/lib, false, false, true
 *     - maven-compiler-plugin: 1.6, 1.6, UTF-8
 *     - org.apache.maven.plugins / maven-resources-plugin: UTF-8
 *     - org.apache.maven.plugins / maven-jar-plugin: true, ./lib/,
 *       us.codecraft.webmagic.scripts.ScriptConsole
 *
 *   webmagic-scripts/src/main/resources/log4j.xml (new file, 21 lines,
 *   content not recoverable)
 */
package us.codecraft.webmagic.scripts;

import com.google.common.collect.Sets;
import org.apache.commons.cli.*;
import us.codecraft.webmagic.Spider;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Command-line launcher that builds a {@link ScriptProcessor} from a script
 * file and runs a {@link Spider} over the URLs given as positional arguments.
 *
 * <p>Recognised options: {@code -l language}, {@code -f script file},
 * {@code -t threadnum}, {@code -s sleep time}. Only {@code -f} and at least
 * one URL are mandatory; on any usage error the process prints the usage line
 * and terminates with exit status -1.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptConsole {

    /** Usage line printed on every command-line error. */
    private static final String USAGE =
            "Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]";

    /**
     * Holder for the parsed command-line settings, pre-populated with the
     * defaults used when an option is absent.
     */
    private static class Params {

        /** Accepted spellings of each language as given to {@code -l}. */
        private static final Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();

        static {
            alias.put(Language.JavaScript, Sets.newHashSet("js", "javascript", "JavaScript", "JS"));
            alias.put(Language.JRuby, Sets.newHashSet("ruby", "jruby", "Ruby", "JRuby"));
        }

        private Language language = Language.JavaScript;
        private String scriptFileName;
        private List<String> urls;
        private int thread = 1;
        private int sleepTime = 1000;

        /**
         * Selects the language whose alias set contains {@code arg}.
         *
         * @param arg the value passed to {@code -l}
         * @return {@code true} if {@code arg} matched an alias; {@code false}
         *         if it matched none, in which case the current language is
         *         left untouched
         */
        public boolean setLanguagefromArg(String arg) {
            for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
                if (languageSetEntry.getValue().contains(arg)) {
                    this.language = languageSetEntry.getKey();
                    return true;
                }
            }
            return false;
        }

        private Language getLanguage() {
            return language;
        }

        private void setLanguage(Language language) {
            this.language = language;
        }

        private String getScriptFileName() {
            return scriptFileName;
        }

        private void setScriptFileName(String scriptFileName) {
            this.scriptFileName = scriptFileName;
        }

        private List<String> getUrls() {
            return urls;
        }

        private void setUrls(List<String> urls) {
            this.urls = urls;
        }

        private int getThread() {
            return thread;
        }

        private void setThread(int thread) {
            this.thread = thread;
        }

        private int getSleepTime() {
            return sleepTime;
        }

        private void setSleepTime(int sleepTime) {
            this.sleepTime = sleepTime;
        }
    }

    /**
     * Parses {@code args} and starts the spider.
     *
     * @param args raw command-line arguments
     */
    public static void main(String[] args) {
        Params params = parseCommand(args);
        startSpider(params);
    }

    /**
     * Builds the script processor and spider from {@code params}, queues
     * every URL and runs the spider.
     *
     * <p>The URL list is checked first so that a missing URL is reported
     * before the script processor is built.
     *
     * @param params parsed command-line settings
     */
    private static void startSpider(Params params) {
        List<String> urls = params.getUrls();
        if (urls == null || urls.isEmpty()) {
            System.err.println("Need at least one argument");
            System.out.println(USAGE);
            System.exit(-1);
        }
        ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
                .language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).build();
        pageProcessor.getSite().setSleepTime(params.getSleepTime());
        Spider spider = Spider.create(pageProcessor).thread(params.getThread());
        for (String url : urls) {
            spider.addUrl(url);
        }
        spider.run();
    }

    /**
     * Declares the supported options and parses {@code args} against them.
     *
     * <p>On a parse error or a non-numeric {@code -t}/{@code -s} value the
     * error message and the usage line are printed and the process exits.
     *
     * @param args raw command-line arguments
     * @return the populated settings
     */
    private static Params parseCommand(String[] args) {
        Options options = new Options();
        options.addOption(new Option("l", true, "language"));
        options.addOption(new Option("t", true, "thread"));
        options.addOption(new Option("f", true, "script file"));
        // "-s" is read by readOptions and advertised in USAGE, so it has to be
        // declared here; otherwise the parser rejects it as unrecognised.
        options.addOption(new Option("s", true, "sleep time"));
        try {
            CommandLineParser commandLineParser = new PosixParser();
            CommandLine commandLine = commandLineParser.parse(options, args);
            return readOptions(commandLine);
        } catch (ParseException e) {
            System.err.println(e.getMessage());
        } catch (NumberFormatException e) {
            System.err.println("Not a number: " + e.getMessage());
        }
        exit();
        return null; // not reached at runtime: exit() terminates the JVM
    }

    /** Reports a malformed command line, prints the usage line and exits with status -1. */
    private static void exit() {
        System.err.println("Format error");
        System.out.println(USAGE);
        System.exit(-1);
    }

    /**
     * Copies option values from {@code commandLine} into a new
     * {@link Params}; the remaining positional arguments become the URL list.
     *
     * @param commandLine the parsed command line
     * @return the populated settings
     * @throws NumberFormatException if the {@code -s} or {@code -t} value is
     *         not an integer
     */
    private static Params readOptions(CommandLine commandLine) {
        Params params = new Params();
        if (commandLine.hasOption("l")) {
            String language = commandLine.getOptionValue("l");
            if (!params.setLanguagefromArg(language)) {
                // Keep the default rather than abort, but do not stay silent.
                System.err.println("Unknown language \"" + language + "\", using " + params.getLanguage());
            }
        }
        if (commandLine.hasOption("f")) {
            params.setScriptFileName(commandLine.getOptionValue("f"));
        } else {
            exit();
        }
        if (commandLine.hasOption("s")) {
            params.setSleepTime(Integer.parseInt(commandLine.getOptionValue("s")));
        }
        if (commandLine.hasOption("t")) {
            params.setThread(Integer.parseInt(commandLine.getOptionValue("t")));
        }
        // getArgs() is typed String[], unlike the raw List from getArgList().
        params.setUrls(Arrays.asList(commandLine.getArgs()));
        return params;
    }
}