diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693..15206b92 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -49,7 +49,7 @@ public class Page { private byte[] bytes; - private List targetRequests = new ArrayList(); + private List targetRequests = new ArrayList<>(); private String charset; @@ -108,6 +108,7 @@ public class Page { * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ + @Deprecated public void setHtml(Html html) { this.html = html; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9fc28619..b73665ab 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -40,9 +40,9 @@ public class Request implements Serializable { /** * cookies for current url, if not set use Site's cookies */ - private Map cookies = new HashMap(); + private Map cookies = new HashMap<>(); - private Map headers = new HashMap(); + private Map headers = new HashMap<>(); /** * Priority of the request.
@@ -94,7 +94,7 @@ public class Request implements Serializable { public Request putExtra(String key, T value) { if (extras == null) { - extras = new HashMap(); + extras = new HashMap<>(); } extras.put(key, value); return this; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 488c81e7..273b0a30 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -14,7 +14,7 @@ import java.util.Map; */ public class ResultItems { - private Map fields = new LinkedHashMap(); + private Map fields = new LinkedHashMap<>(); private Request request; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4879b282..9cbda022 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -22,9 +22,9 @@ public class Site { private String userAgent; - private Map defaultCookies = new LinkedHashMap(); + private Map defaultCookies = new LinkedHashMap<>(); - private Map> cookies = new HashMap>(); + private Map> cookies = new HashMap<>(); private String charset; @@ -38,11 +38,11 @@ public class Site { private int timeOut = 5000; - private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet<>(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; - private Map headers = new HashMap(); + private Map headers = new HashMap<>(); private boolean useGzip = true; @@ -83,7 +83,7 @@ public class Site { */ public Site addCookie(String domain, String name, String value) { if (!cookies.containsKey(domain)){ - cookies.put(domain,new HashMap()); + cookies.put(domain,new HashMap<>()); } cookies.get(domain).put(name, value); return this; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index ce638375..92554814 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -62,7 +62,7 @@ public class Spider implements Runnable, Task { protected Downloader downloader; - protected List pipelines = new ArrayList(); + protected List pipelines = new ArrayList<>(); protected PageProcessor pageProcessor; @@ -86,11 +86,11 @@ public class Spider implements Runnable, Task { protected boolean exitWhenComplete = true; - protected final static int STAT_INIT = 0; + protected static final int STAT_INIT = 0; - protected final static int STAT_RUNNING = 1; + protected static final int STAT_RUNNING = 1; - protected final static int STAT_STOPPED = 2; + protected static final int STAT_STOPPED = 2; protected boolean spawnUrl = true; @@ -248,7 +248,7 @@ public class Spider implements Runnable, Task { * @return this */ public Spider clearPipeline() { - pipelines = new ArrayList(); + pipelines = new ArrayList<>(); return this; } @@ -439,7 +439,6 @@ public class Spider implements Runnable, Task { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); - return; } private void onDownloaderFail(Request request) { @@ -545,7 +544,7 @@ public class Spider implements Runnable, Task { public T get(String url) { List urls = WMCollections.newArrayList(url); List resultItemses = getAll(urls); - if (resultItemses != null && resultItemses.size() > 0) { + if (resultItemses != null && !(resultItemses.isEmpty())) { return resultItemses.get(0); } else { return null; @@ -678,7 +677,7 @@ public class Spider implements Runnable, Task { public enum Status { - Init(0), Running(1), Stopped(2); + INIT(0), RUNNING(1), STOPPED(2); private Status(int value) { this.value = value; @@ -697,7 +696,7 @@ public class Spider implements Runnable, Task { } } //default value - return Init; + return INIT; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d0..a71a7d87 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -32,7 +32,7 @@ public abstract class AbstractDownloader implements Downloader { */ public Html download(String url, String charset) { Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); - return (Html) page.getHtml(); + return page.getHtml(); } protected void onSuccess(Request request) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e11..b2a39e91 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -33,7 +33,7 @@ public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); - private final Map httpClients = new HashMap(); + private final Map httpClients = new HashMap<>(); private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 7d3b3078..afd6f88b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -16,6 +16,8 @@ import java.util.Map; */ public class HttpRequestBody implements Serializable { + private static final String ILL_ENC = "illegal encoding "; + private static final long serialVersionUID = 5659170945717023595L; public static abstract class ContentType { @@ -68,7 +70,7 @@ public class HttpRequestBody implements Serializable { try { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ILL_ENC + encoding, e); } } @@ -76,7 +78,7 @@ public class HttpRequestBody implements Serializable { try { return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ILL_ENC + encoding, e); } } @@ -92,7 +94,7 @@ public class HttpRequestBody implements Serializable { try { return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ILL_ENC + encoding, e); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index ccf00a46..85852bca 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -19,6 +19,7 @@ import java.nio.charset.Charset; */ public abstract class CharsetUtils { + private static final String CHR = "charset"; private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class); public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { @@ -40,9 +41,9 @@ public abstract class CharsetUtils { for (Element link : links) { // 2.1、html4.01 String metaContent = link.attr("content"); - String metaCharset = link.attr("charset"); - if (metaContent.indexOf("charset") != -1) { - metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); + String metaCharset = link.attr(CHR); + if (metaContent.indexOf(CHR) != -1) { + metaContent = metaContent.substring(metaContent.indexOf(CHR), metaContent.length()); charset = metaContent.split("=")[1]; break; } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java index 3f2de70c..e2759e56 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java @@ -19,6 +19,7 @@ import static us.codecraft.webmagic.selector.Selectors.xpath; */ public class ZipCodePageProcessor implements PageProcessor { + private Site site = Site.me().setCharset("gb2312") .setSleepTime(100); diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index d53630fe..4de3e01e 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -63,54 +63,47 @@ class WebDriverPool { * @throws IOException */ public void configure() throws IOException { - try{ - // Read config file - sConfig = new Properties(); - String configFile = DEFAULT_CONFIG_FILE; - if (System.getProperty("selenuim_config")!=null){ - configFile = System.getProperty("selenuim_config"); + + // Read config file + sConfig = new Properties(); + String configFile = DEFAULT_CONFIG_FILE; + if (System.getProperty("selenuim_config")!=null){ + configFile = System.getProperty("selenuim_config"); + } + sConfig.load(new FileReader(configFile)); + + // Prepare capabilities + sCaps = new DesiredCapabilities(); + sCaps.setJavascriptEnabled(true); + sCaps.setCapability("takesScreenshot", false); + + String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); + + // Fetch PhantomJS-specific configuration parameters + if (driver.equals(DRIVER_PHANTOMJS)) { + // "phantomjs_exec_path" + if (sConfig.getProperty("phantomjs_exec_path") != null) { + sCaps.setCapability( + PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, + sConfig.getProperty("phantomjs_exec_path")); + } else { + throw new IOException( + String.format( + "Property '%s' not set!", + PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY)); } - FileReader configFileReader = new FileReader(configFile) - sConfig.load(configFileReader); - - // Prepare capabilities - sCaps = new DesiredCapabilities(); - sCaps.setJavascriptEnabled(true); - sCaps.setCapability("takesScreenshot", false); - String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); - // Fetch PhantomJS-specific configuration parameters - if (driver.equals(DRIVER_PHANTOMJS)) { - // "phantomjs_exec_path" - if (sConfig.getProperty("phantomjs_exec_path") != null) { - sCaps.setCapability( - PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, - sConfig.getProperty("phantomjs_exec_path")); - } else { - throw new IOException( - String.format( - "Property '%s' not set!", - PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY)); - } - // "phantomjs_driver_path" - if (sConfig.getProperty("phantomjs_driver_path") != null) { - System.out.println("Test will use an external GhostDriver"); - sCaps.setCapability( - PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY, - sConfig.getProperty("phantomjs_driver_path")); - } else { - System.out - .println("Test will use PhantomJS internal GhostDriver"); - } + // "phantomjs_driver_path" + if (sConfig.getProperty("phantomjs_driver_path") != null) { + System.out.println("Test will use an external GhostDriver"); + sCaps.setCapability( + PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY, + sConfig.getProperty("phantomjs_driver_path")); + } else { + System.out + .println("Test will use PhantomJS internal GhostDriver"); } - }catch(Exception e){ - throw new IOException("Can not load config file properly"); - - }finally{ - configFileReader.close(); } - - // Disable "web-security", enable all possible "ssl-protocols" and // "ignore-ssl-errors" for PhantomJSDriver // sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new