Pipeline & PageProcessor



public interface Downloader {

* Downloads web pages and store in Page object.
* @param request request
* @param task task
* @return page
public Page download(Request request, Task task);

* Tell the downloader how many threads the spider used.
* @param threadNum number of threads
public void setThread(int threadNum);


/**
 * Downloader that executes requests through {@code CloseableHttpClient} instances
 * obtained from an {@code HttpClientGenerator}.
 */
public class HttpClientDownloader extends AbstractDownloader {

// Logger named after the runtime class of this instance.
private Logger logger = LoggerFactory.getLogger(getClass());

// Clients created so far; getHttpClient() stores and looks them up under Site.getDomain().
private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();

// Factory that getHttpClient() asks for a client for a given site and proxy.
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
if (site == null) {
return httpClientGenerator.getClient(null, proxy);
String domain = site.getDomain();
CloseableHttpClient httpClient = httpClients.get(domain);
if (httpClient == null) {
synchronized (this) {
httpClient = httpClients.get(domain);
if (httpClient == null) {
httpClient = httpClientGenerator.getClient(site, proxy);
httpClients.put(domain, httpClient);
return httpClient;

public Page download(Request request, Task task) {
Site site = null;
if (task != null) {
site = task.getSite();
Set<Integer> acceptStatCode;
String charset = null;
Map<String, String> headers = null;
if (site != null) {
acceptStatCode = site.getAcceptStatCode();
charset = site.getCharset();
headers = site.getHeaders();
} else {
acceptStatCode = Sets.newHashSet(200);
logger.info("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null;
int statusCode=0;
try {
HttpHost proxyHost = null;
Proxy proxy = null; //TODO
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
proxy = site.getHttpProxyFromPool();
proxyHost = proxy.getHttpHost();
} else if(site.getHttpProxy()!= null){
proxyHost = site.getHttpProxy();

HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);��֤
statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) {
Page page = handleResponse(request, charset, httpResponse, task);
return page;
} else {
logger.warn("code error " + statusCode + "\t" + request.getUrl());
return null;
} catch (IOException e) {
logger.warn("download page " + request.getUrl() + " error", e);
if (site.getCycleRetryTimes() > 0) {
return addToCycleRetry(request, site);
return null;
} finally {
request.putExtra(Request.STATUS_CODE, statusCode);
if (site.getHttpProxyPool()!=null && site.getHttpProxyPool().isEnable()) {
site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request
try {
if (httpResponse != null) {
//ensure the connection is released back to pool
} catch (IOException e) {
logger.warn("close response fail", e);

// Receives the thread count announced through Downloader.setThread(int).
// NOTE(review): no body statements are visible for this method in this excerpt.
public void setThread(int thread) {

protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
return acceptStatCode.contains(statusCode);

/**
 * Builds the outgoing HTTP request: picks the builder via selectRequestMethod, sets the URL
 * from request.getUrl(), and copies every entry of headers (when non-null) onto the request.
 *
 * NOTE(review): this excerpt looks truncated - the RequestConfig.custom() statement is not
 * terminated and the site parameter is never read in the visible lines; check the full source.
 */
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers,HttpHost proxy) {
RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
if (headers != null) {
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
if (proxy !=null) {
// Remember the proxy on the request under Request.PROXY; download() reads it back in its finally block.
request.putExtra(Request.PROXY, proxy);
return requestBuilder.build();

/**
 * Chooses the RequestBuilder matching request.getMethod(), compared case-insensitively
 * against the HttpConstant.Method names; a null method is treated as GET.
 * Throws IllegalArgumentException for a method other than GET, POST, HEAD, PUT, DELETE or TRACE.
 */
protected RequestBuilder selectRequestMethod(Request request) {
String method = request.getMethod();
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
//default get
return RequestBuilder.get();
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
RequestBuilder requestBuilder = RequestBuilder.post();
// POST parameters, if any, are read from the request extra stored under "nameValuePair".
NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
// NOTE(review): the statement guarded by this check is not visible in this excerpt.
if (nameValuePair != null && nameValuePair.length > 0) {
return requestBuilder;
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return RequestBuilder.head();
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
return RequestBuilder.put();
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
return RequestBuilder.delete();
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
return RequestBuilder.trace();
// Reached only when none of the branches above matched.
throw new IllegalArgumentException("Illegal HTTP Method " + method);

/**
 * Turns an HTTP response into a Page: decodes the body with getContent and sets the page URL
 * from request.getUrl().
 *
 * NOTE(review): content and task are not used in the visible lines; the excerpt appears to
 * omit the statements that populate the rest of the Page - check the full source.
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
// Decode the response body; a null charset lets getContent fall back to auto-detection.
String content = getContent(charset, httpResponse);
Page page = new Page();
page.setUrl(new PlainText(request.getUrl()));
return page;

protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
if (charset == null) {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
if (htmlCharset != null) {
return new String(contentBytes, htmlCharset);
} else {
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
return new String(contentBytes);
} else {
return IOUtils.toString(httpResponse.getEntity().getContent(), charset);

protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
String charset;
// charset
// 1、encoding in http header Content-Type
String value = httpResponse.getEntity().getContentType().getValue();
charset = UrlUtils.getCharset(value);
if (StringUtils.isNotBlank(charset)) {
logger.debug("Auto get charset: {}", charset);
return charset;
// use default charset to decode first time
Charset defaultCharset = Charset.defaultCharset();
String content = new String(contentBytes, defaultCharset.name());
// 2、charset in meta
if (StringUtils.isNotEmpty(content)) {
Document document = Jsoup.parse(content);
Elements links = document.select("meta");
for (Element link : links) {
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
String metaContent = link.attr("content");
String metaCharset = link.attr("charset");
if (metaContent.indexOf("charset") != -1) {
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
charset = metaContent.split("=")[1];
// 2.2、html5 <meta charset="UTF-8" />
else if (StringUtils.isNotEmpty(metaCharset)) {
charset = metaCharset;
logger.debug("Auto get charset: {}", charset);
// 3、todo use tools as cpdetector for content decode
return charset;

其中包括了添加 HTTP proxy 的逻辑,这部分官方文档并没有介绍,如果需要,就自行阅读源码吧 - -b

// Excerpt from download(): a response whose status code is not in acceptStatCode
// is only logged, and null is returned instead of a Page.
if (statusAccept(acceptStatCode, statusCode)) {
Page page = handleResponse(request, charset, httpResponse, task);
return page;
} else {
logger.warn("code error " + statusCode + "\t" + request.getUrl());
return null;

acceptStatCode默认是200,如果出现其他statusCode,那么将会直接return null。看上去这样就不会释放HttpClient的资源,也就是下面的finally块不会被执行,似乎是一个bug;不过需要注意:在Java中,try块里的return并不会跳过finally,下面的finally块仍会在方法返回之前执行。

// Excerpt: the finally block of download(); it runs on every exit path of the try/catch.
finally {
    request.putExtra(Request.STATUS_CODE, statusCode);
    if (site.getHttpProxyPool()!=null && site.getHttpProxyPool().isEnable()) {
        // NOTE(review): the second argument was garbled in the excerpt; reconstructed as the
        // status code stored on the request just above - confirm against the full source.
        site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY),
                (Integer) request.getExtra(Request.STATUS_CODE));
    }
    try {
        if (httpResponse != null) {
            //ensure the connection is released back to pool
            // NOTE(review): the release statement itself was missing from the excerpt;
            // consuming the entity is the usual HttpClient way - confirm against the full source.
            org.apache.http.util.EntityUtils.consume(httpResponse.getEntity());
        }
    } catch (IOException e) {
        logger.warn("close response fail", e);
    }
}


