用HttpClient做爬虫时,常常需要发起大量请求,此时使用多线程比较合适。

Java自带的线程池比较完善、稳定。下面以它为例,分享一段多线程发起请求并复用HttpClient的代码。

// Demo: issue 100 GET requests from a 40-thread pool while sharing a single pooled HttpClient.
// Classes the original snippet did not reference are written fully qualified so that no extra
// imports are needed.
PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
connectionManager.setMaxTotal(200);
// Every request targets the same host (one route). The per-route limit defaults to a small
// value, so raise it to the worker count; otherwise most threads just wait for a connection.
connectionManager.setDefaultMaxPerRoute(40);
// The client must be built on top of the configured manager; HttpClients.createDefault()
// would create its own manager and silently ignore the limits set above.
CloseableHttpClient httpClient = HttpClients.custom()
        .setConnectionManager(connectionManager)
        .build();
// Maximum time (ms) a worker waits to lease a connection from the pool.
RequestConfig requestConfig = RequestConfig.custom().setConnectionRequestTimeout(11000).build();
ExecutorService executorService = Executors.newFixedThreadPool(40);
try {
    // A list (not a HashSet) keeps the futures, and therefore the printed results, in
    // submission order.
    List<Callable<String>> callables = new java.util.ArrayList<>();
    for (int i = 0; i < 100; i++) {
        int finalI = i;
        callables.add(() -> {
            System.out.println("start" + finalI);
            HttpGet httpGet = new HttpGet("http://www.baidu.com/" + finalI);
            httpGet.setConfig(requestConfig);
            int statusCode;
            // Closing the response hands the connection back to the pool; fully consuming
            // the entity first allows the underlying connection to be kept alive and reused.
            try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
                statusCode = httpResponse.getStatusLine().getStatusCode();
                org.apache.http.util.EntityUtils.consume(httpResponse.getEntity());
            }
            System.out.println("end" + finalI);
            return finalI + " over " + "status " + statusCode;
        });
    }

    try {
        // invokeAll blocks until every task has completed (normally or exceptionally).
        List<Future<String>> result = executorService.invokeAll(callables);
        for (Future<String> stringFuture : result) {
            try {
                System.out.println(stringFuture.get());
            } catch (ExecutionException e) {
                // A single failed request must not prevent the other results from printing.
                e.printStackTrace();
            }
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so code further up the stack can react to it, and
        // skip result processing instead of dereferencing a null result list.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
} finally {
    // Always release the worker threads and the pooled connections, even on failure.
    executorService.shutdown();
    try {
        httpClient.close();
    } catch (java.io.IOException e) {
        e.printStackTrace();
    }
}