dotNet使用HttpWebRequest模拟浏览器
在编写网络爬虫时,HttpWebRequest几乎可以完成绝大多数网站的抓取,为了更好地使用这一技术,我将常用的几个功能进行了封装,以方便调用。这个类已经在多个项目中得到使用,主要解决了Cookies相关的一些问题;如果有其它方面的问题可以提出来,我会进一步完善。
目前HttpHelper包含了以下几个方面:
- GetHttpContent:通过Get或Post来获取网页的Html
- SetCookie:根据response中头部的set-cookie对cookie进行设置,能识别httponly
- GetAllCookies:将CookieContainer转换为键值对,方便存储和跨程序间调用
- ConvertToCookieContainer:将键值对转换回CookieContainer供程序调用
- BuildPostData:通过一个需要post的html构建出postdata
代码如下:
using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using HtmlAgilityPack; namespace TNIdea.Common.Helper
{
/// <summary>
/// Static helpers that imitate a browser on top of <see cref="HttpWebRequest"/>:
/// fetching a page by GET or POST, parsing Set-Cookie headers, converting a
/// <see cref="CookieContainer"/> to and from a plain dictionary, and building
/// POST data from an HTML form.
/// </summary>
public class HttpHelper
{
    /// <summary>
    /// Regular expression that captures, in the named group <c>Charset</c>, the character
    /// set declared by an HTML meta tag or by an XML prolog.
    /// </summary>
    public const string CharsetReg = @"(meta.*?charset=""?(?<Charset>[^\s""'>]+)""?)|(xml.*?encoding=""?(?<Charset>[^\s"">]+)""?)";

    // Maximum number of leading bytes inspected while looking for a charset declaration.
    // NOTE(review): the literal was missing from the published listing; 10000 is assumed.
    private const int HeadProbeLength = 10000;

    // The probe stops early as soon as the document head has been read completely.
    private const string HeadCloseTag = "</head>";

    // User agents sent when the caller does not supply one (GET and POST historically differ).
    private const string DefaultGetUserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36";
    private const string DefaultPostUserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36";

    // Matches a Set-Cookie fragment that was cut right after the weekday of an "expires"
    // date (for example "...; expires=Thu"), which happens when the header is split on commas.
    private static readonly Regex TruncatedExpiresPattern =
        new Regex(@";\s*expires\s*=\s*[A-Za-z]+\s*$", RegexOptions.IgnoreCase);

    /// <summary>
    /// Downloads a page and returns its body as text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="postData">Form data to send. When null, empty or whitespace a GET is issued, otherwise a POST.</param>
    /// <param name="cookies">
    /// Cookie container attached to the request. When supplied, cookies parsed from the response's
    /// Set-Cookie headers that the container does not already hold for the response URI are added to it.
    /// </param>
    /// <param name="userAgent">User-Agent header; a built-in browser string is used when blank.</param>
    /// <param name="referer">Referer header.</param>
    /// <param name="cookiesDomain">Domain given to parsed cookies that declare none; defaults to the host of the response URI.</param>
    /// <param name="encode">Encoding used to decode the body; detected from the response when null.</param>
    /// <returns>
    /// The decoded body. An empty string when the request itself fails, and the text of the
    /// exception when reading or decoding the body fails.
    /// </returns>
    public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null)
    {
        try
        {
            HttpWebResponse httpResponse = string.IsNullOrWhiteSpace(postData)
                ? CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer)
                : CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer);

            try
            {
                // Headers are complete before the body is read, so cookies are captured first,
                // while the response has not been closed yet.
                if (cookies != null)
                {
                    MergeResponseCookies(httpResponse, cookies, cookiesDomain);
                }

                using (Stream body = OpenDecodedStream(httpResponse))
                {
                    try
                    {
                        return ReadContent(body, httpResponse, encode);
                    }
                    catch (Exception ex)
                    {
                        // Kept for backward compatibility: read failures are reported as text.
                        return ex.ToString();
                    }
                }
            }
            finally
            {
                httpResponse.Close();
            }
        }
        catch
        {
            // Best effort by contract: connection errors, HTTP error statuses and the like yield "".
            return string.Empty;
        }
    }

    /// <summary>
    /// Sends a GET request and returns the response.
    /// </summary>
    /// <param name="url">Address to request (http or https).</param>
    /// <param name="timeout">Request timeout in milliseconds. NOTE(review): the default was missing from the published listing; 60000 is assumed.</param>
    /// <param name="userAgent">User-Agent header; a built-in browser string is used when blank.</param>
    /// <param name="cookies">Cookie container attached to the request; a new one is created when null.</param>
    /// <param name="referer">Referer header.</param>
    /// <returns>The response; the caller is responsible for closing it.</returns>
    public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
    {
        HttpWebRequest request = CreateRequest(url, "GET", timeout, userAgent, DefaultGetUserAgent, cookies, referer);
        return (HttpWebResponse)request.GetResponse();
    }

    /// <summary>
    /// Sends a form-encoded POST request and returns the response.
    /// </summary>
    /// <param name="url">Address to request (http or https).</param>
    /// <param name="postData">Request body, already in application/x-www-form-urlencoded form; sent as UTF-8.</param>
    /// <param name="timeout">Request timeout in milliseconds. NOTE(review): the default was missing from the published listing; 60000 is assumed.</param>
    /// <param name="userAgent">User-Agent header; a built-in browser string is used when blank.</param>
    /// <param name="cookies">Cookie container attached to the request; a new one is created when null.</param>
    /// <param name="referer">Referer header.</param>
    /// <returns>The response; the caller is responsible for closing it.</returns>
    public static HttpWebResponse CreatePostHttpResponse(string url, string postData, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
    {
        HttpWebRequest request = CreateRequest(url, "POST", timeout, userAgent, DefaultPostUserAgent, cookies, referer);
        request.ContentType = "application/x-www-form-urlencoded";

        if (!string.IsNullOrWhiteSpace(postData))
        {
            byte[] data = Encoding.UTF8.GetBytes(postData);
            request.ContentLength = data.Length;
            using (Stream stream = request.GetRequestStream())
            {
                stream.Write(data, 0, data.Length);
            }
        }

        return (HttpWebResponse)request.GetResponse();
    }

    /// <summary>
    /// Builds a request with the settings shared by the GET and POST helpers.
    /// </summary>
    private static HttpWebRequest CreateRequest(string url, string method, int timeout, string userAgent, string fallbackUserAgent, CookieContainer cookies, string referer)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
        {
            // Process-wide setting: installs this class's certificate check for every HTTPS request.
            ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
        }

        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            throw new ArgumentException("Only http and https URLs are supported.", "url");
        }

        request.Referer = referer;
        request.Method = method;
        request.UserAgent = string.IsNullOrWhiteSpace(userAgent) ? fallbackUserAgent : userAgent;
        request.Timeout = timeout;
        request.KeepAlive = true;
        request.AllowAutoRedirect = true;
        request.CookieContainer = cookies ?? new CookieContainer();
        return request;
    }

    /// <summary>
    /// Certificate validation callback: accepts a server certificate only when no policy errors were reported.
    /// </summary>
    /// <param name="sender">Unused.</param>
    /// <param name="certificate">Unused.</param>
    /// <param name="chain">Unused.</param>
    /// <param name="errors">Errors found by the platform while validating the certificate.</param>
    /// <returns>True when validation reported no errors.</returns>
    private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
    {
        return errors == SslPolicyErrors.None;
    }

    /// <summary>
    /// Returns the response stream, wrapped in a decompressor when Content-Encoding is gzip or deflate.
    /// </summary>
    private static Stream OpenDecodedStream(HttpWebResponse response)
    {
        Stream raw = response.GetResponseStream();
        string contentEncoding = response.ContentEncoding ?? string.Empty;

        if (string.Equals(contentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
        {
            return new GZipStream(raw, CompressionMode.Decompress);
        }

        if (string.Equals(contentEncoding, "deflate", StringComparison.OrdinalIgnoreCase))
        {
            return new DeflateStream(raw, CompressionMode.Decompress);
        }

        return raw;
    }

    /// <summary>
    /// Reads the whole body and decodes it, sniffing the encoding from the document head when none is given.
    /// </summary>
    private static string ReadContent(Stream body, HttpWebResponse response, Encoding encode)
    {
        byte[] probe = new byte[HeadProbeLength];
        int count = 0;

        // One char per byte: enough to find the ASCII-only charset declaration.
        StringBuilder headText = new StringBuilder();

        while (count < HeadProbeLength && !EndsWithHeadClose(headText))
        {
            // Keep the int so that -1 (end of stream) is still distinguishable from byte 0xFF.
            int next = body.ReadByte();
            if (next < 0)
            {
                break;
            }

            probe[count++] = (byte)next;
            headText.Append((char)next);
        }

        if (encode == null)
        {
            encode = ResolveEncoding(response, headText.ToString());
        }

        // Decode probe and remainder in one pass so that a multi-byte character that
        // straddles the probe boundary is not broken in two.
        using (MemoryStream all = new MemoryStream())
        {
            all.Write(probe, 0, count);
            body.CopyTo(all);
            return encode.GetString(all.ToArray());
        }
    }

    /// <summary>
    /// Tells whether the text read so far ends with the closing head tag (case-insensitive).
    /// </summary>
    private static bool EndsWithHeadClose(StringBuilder text)
    {
        int start = text.Length - HeadCloseTag.Length;
        return start >= 0
            && string.Equals(text.ToString(start, HeadCloseTag.Length), HeadCloseTag, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Chooses the body encoding: the charset of the response header when it is usable, otherwise
    /// the charset declared in the document head, otherwise GB2312, otherwise UTF-8.
    /// </summary>
    private static Encoding ResolveEncoding(HttpWebResponse response, string headText)
    {
        string declared = response.CharacterSet;

        // These two header values are not trusted and trigger sniffing of the document head.
        bool headerIsUnreliable = declared == "ISO-8859-1" || declared == "zh-cn";

        if (!headerIsUnreliable)
        {
            Encoding fromHeader = TryGetEncoding(declared);
            if (fromHeader != null)
            {
                return fromHeader;
            }
        }

        Match match = Regex.Match(headText, CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);
        if (match.Success)
        {
            Encoding fromDocument = TryGetEncoding(match.Groups["Charset"].Value);
            if (fromDocument != null)
            {
                return fromDocument;
            }
        }

        // GB2312 is the historical default of this helper; UTF-8 covers platforms without that code page.
        return TryGetEncoding("GB2312") ?? Encoding.UTF8;
    }

    /// <summary>
    /// Looks up an encoding by name; returns null instead of throwing when the name is blank or unknown.
    /// </summary>
    private static Encoding TryGetEncoding(string name)
    {
        if (string.IsNullOrWhiteSpace(name))
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(name.Trim().Trim('"', '\''));
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Adds the cookies of a response to the caller's container, skipping names the container
    /// already holds for the response URI so that automatically stored cookies are not duplicated.
    /// </summary>
    private static void MergeResponseCookies(HttpWebResponse response, CookieContainer cookies, string cookiesDomain)
    {
        if (string.IsNullOrWhiteSpace(cookiesDomain))
        {
            cookiesDomain = response.ResponseUri.Host;
        }

        CookieCollection parsed = SetCookie(response, cookiesDomain) ?? response.Cookies;
        CookieCollection known = cookies.GetCookies(response.ResponseUri);

        foreach (Cookie cookie in parsed)
        {
            if (string.IsNullOrEmpty(cookie.Domain) || known[cookie.Name] != null)
            {
                continue;
            }

            try
            {
                cookies.Add(cookie);
            }
            catch (CookieException)
            {
                // A cookie the container rejects must not cost the caller the page content.
            }
        }
    }

    /// <summary>
    /// Parses the Set-Cookie headers of a response into cookies, recognising the
    /// path, expires, domain, HttpOnly and Secure attributes.
    /// </summary>
    /// <param name="response">Response whose headers are read.</param>
    /// <param name="defaultDomain">Domain assigned to cookies that do not declare one.</param>
    /// <returns>The parsed cookies, or null when the response has no Set-Cookie header or the headers cannot be read.</returns>
    private static CookieCollection SetCookie(HttpWebResponse response, string defaultDomain)
    {
        try
        {
            string[] rawValues = response.Headers.GetValues("Set-Cookie");
            if (rawValues == null)
            {
                return null;
            }

            CookieCollection cookies = new CookieCollection();
            foreach (string header in RejoinSplitExpires(rawValues))
            {
                Cookie cookie = ParseSetCookie(header, defaultDomain);
                if (cookie != null)
                {
                    cookies.Add(cookie);
                }
            }

            return cookies;
        }
        catch
        {
            // Contract kept from the original: never throws, null means "fall back to response.Cookies".
            return null;
        }
    }

    /// <summary>
    /// Re-joins header values that were split at the comma inside an "expires" date
    /// ("expires=Thu" followed by "01-Jan-2015 00:00:00 GMT; ...").
    /// </summary>
    private static List<string> RejoinSplitExpires(string[] pieces)
    {
        List<string> joined = new List<string>(pieces.Length);
        foreach (string piece in pieces)
        {
            int last = joined.Count - 1;
            if (last >= 0 && TruncatedExpiresPattern.IsMatch(joined[last]))
            {
                joined[last] = joined[last] + ", " + piece.TrimStart();
            }
            else
            {
                joined.Add(piece);
            }
        }

        return joined;
    }

    /// <summary>
    /// Parses one Set-Cookie value. The first "name=value" pair names the cookie; the remaining
    /// segments are attributes, and unknown attributes are ignored.
    /// </summary>
    /// <returns>The cookie, or null when the value has no usable name.</returns>
    private static Cookie ParseSetCookie(string header, string defaultDomain)
    {
        string[] parts = header.Split(';');
        int nameEnd = parts[0].IndexOf('=');
        if (nameEnd <= 0)
        {
            return null;
        }

        string name = parts[0].Substring(0, nameEnd).Trim();
        if (name.Length == 0)
        {
            return null;
        }

        try
        {
            Cookie cookie = new Cookie();
            cookie.Name = name;
            cookie.Value = parts[0].Substring(nameEnd + 1).Trim();

            for (int i = 1; i < parts.Length; i++)
            {
                string part = parts[i].Trim();
                if (part.Length == 0)
                {
                    continue;
                }

                int separator = part.IndexOf('=');
                string attribute = separator < 0 ? part : part.Substring(0, separator).Trim();
                string attributeValue = separator < 0 ? string.Empty : part.Substring(separator + 1).Trim();

                switch (attribute.ToLowerInvariant())
                {
                    case "path":
                        cookie.Path = attributeValue;
                        break;
                    case "expires":
                        DateTime expires;
                        // An unparsable date leaves the cookie as a session cookie instead of failing the batch.
                        if (DateTime.TryParse(attributeValue, System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out expires))
                        {
                            cookie.Expires = expires;
                        }
                        break;
                    case "domain":
                        cookie.Domain = attributeValue;
                        break;
                    case "httponly":
                        cookie.HttpOnly = true;
                        break;
                    case "secure":
                        cookie.Secure = true;
                        break;
                }
            }

            if (string.IsNullOrEmpty(cookie.Domain))
            {
                cookie.Domain = defaultDomain;
            }

            return cookie;
        }
        catch (CookieException)
        {
            // Names the framework considers invalid are skipped.
            return null;
        }
    }

    /// <summary>
    /// Flattens a cookie container into a dictionary keyed by the container's internal domain key,
    /// each value being "name=value" pairs joined with ';'.
    /// </summary>
    /// <remarks>
    /// Reads the private fields <c>m_domainTable</c> and <c>m_list</c> through reflection, so it
    /// depends on the internal layout of the runtime's CookieContainer.
    /// </remarks>
    /// <param name="cookieContainer">Container to read.</param>
    /// <returns>One entry per domain key held by the container.</returns>
    public static Dictionary<string, string> GetAllCookies(CookieContainer cookieContainer)
    {
        if (cookieContainer == null)
        {
            throw new ArgumentNullException("cookieContainer");
        }

        const System.Reflection.BindingFlags privateField =
            System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.GetField | System.Reflection.BindingFlags.Instance;

        Dictionary<string, string> cookies = new Dictionary<string, string>();
        Hashtable domainTable = (Hashtable)cookieContainer.GetType().InvokeMember("m_domainTable", privateField, null, cookieContainer, new object[] { });

        foreach (string domain in domainTable.Keys)
        {
            object pathList = domainTable[domain];
            SortedList collectionsByPath = (SortedList)pathList.GetType().InvokeMember("m_list", privateField, null, pathList, new object[] { });

            List<string> pairs = new List<string>();
            foreach (CookieCollection collection in collectionsByPath.Values)
            {
                foreach (Cookie cookie in collection)
                {
                    pairs.Add(cookie.Name + "=" + cookie.Value);
                }
            }

            cookies.Add(domain, string.Join(";", pairs));
        }

        return cookies;
    }

    /// <summary>
    /// Rebuilds a cookie container from the dictionary produced by <see cref="GetAllCookies"/>.
    /// Every cookie gets the path "/" and the dictionary key as its domain.
    /// </summary>
    /// <param name="cookies">Domain mapped to "name=value" pairs joined with ';'.</param>
    /// <returns>A new container holding the cookies; segments without a name or '=' are skipped.</returns>
    public static CookieContainer ConvertToCookieContainer(Dictionary<string, string> cookies)
    {
        if (cookies == null)
        {
            throw new ArgumentNullException("cookies");
        }

        CookieContainer cookieContainer = new CookieContainer();

        foreach (KeyValuePair<string, string> entry in cookies)
        {
            if (string.IsNullOrEmpty(entry.Value))
            {
                continue;
            }

            foreach (string pair in entry.Value.Split(';'))
            {
                // Only the first '=' separates name from value; the value may contain more of them.
                int firstEqual = pair.IndexOf('=');
                if (firstEqual <= 0)
                {
                    continue;
                }

                string name = pair.Substring(0, firstEqual).Trim();
                if (name.Length == 0)
                {
                    continue;
                }

                string value = pair.Substring(firstEqual + 1);
                cookieContainer.Add(new Cookie(name, value, "/", entry.Key));
            }
        }

        return cookieContainer;
    }

    /// <summary>
    /// Builds an application/x-www-form-urlencoded body from the input elements of an HTML page.
    /// </summary>
    /// <remarks>
    /// Every input of the document that has both a name and a value attribute is included,
    /// whatever its type or state. The inputs are selected document-wide rather than below the
    /// form node. NOTE(review): presumably because some HtmlAgilityPack versions parse form
    /// elements without children; confirm before narrowing the XPath.
    /// </remarks>
    /// <param name="htmlContent">HTML of the page that contains the form.</param>
    /// <returns>The encoded "name=value" pairs joined with '&amp;', or an empty string when the page has no form or no inputs.</returns>
    public static string BuildPostData(string htmlContent)
    {
        if (string.IsNullOrEmpty(htmlContent))
        {
            return string.Empty;
        }

        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(htmlContent);

        HtmlNode form = htmlDoc.DocumentNode.SelectSingleNode("//form");
        if (form == null)
        {
            return string.Empty;
        }

        // SelectNodes returns null, not an empty collection, when nothing matches.
        HtmlNodeCollection inputs = form.SelectNodes("//input");
        if (inputs == null)
        {
            return string.Empty;
        }

        List<string> pairs = new List<string>();
        foreach (HtmlNode input in inputs)
        {
            HtmlAttribute nameAttribute = input.Attributes["name"];
            HtmlAttribute valueAttribute = input.Attributes["value"];
            if (nameAttribute == null || valueAttribute == null || string.IsNullOrEmpty(nameAttribute.Value))
            {
                continue;
            }

            // Attribute text is HTML-escaped; decode it, then escape it for the form body so that
            // characters such as '+', '&' and '=' inside a value survive the round trip.
            string value = HtmlEntity.DeEntitize(valueAttribute.Value);
            pairs.Add(WebUtility.UrlEncode(nameAttribute.Value) + "=" + WebUtility.UrlEncode(value));
        }

        return string.Join("&", pairs);
    }
}
}
部分网站需要登录的问题我已经着手通过另一个项目来解决(imitate-login),目前还有许多网页使用了JavaScript或各种基于JS的框架来对网页进行数据加载,如何来模拟执行JavaScript暂时还没找到比较优美的解决方案,如果大家有什么好的方案可以发给我,谢谢!
未经授权,拒绝任何全文及摘要转载!
dotNet使用HttpWebRequest模拟浏览器的更多相关文章
- HttpWebRequest 模拟浏览器访问网站
最近抓网页时报错: 要么返回 The remote server returned an error: (442)要么返回: 非法访问,您的行为已被WAF系统记录! 想了想,就当是人家加了抓网页的东西 ...
- HttpWebRequest模拟c#网站登录
用户名 密码 模拟登录asp.net开发的网站 关心两个问题:通过控件属性获取数据.响应事件. 上面是一个普通的asp.net表单.输入用户名.密码后,点击按钮将会进入各自绑定的后台函数,而不仅仅是 ...
- 使用HttpWebRequest模拟登陆阿里巴巴(alibaba、httpwebrequest、login)
前言 其实老喜欢取经,偶尔也得分享下.关于阿里巴巴国际站的登陆,过程有点复杂但是算不上难.一不小心少个东西倒也挺麻烦的. 主要是看下请求类HttpClient基本请求封装使用,AliClient模拟浏 ...
- java 接口中模拟浏览器 请求webservice 接受返回数据
使用HttpClient 所需jar:commons-codec-1.9.jar,commons-httpclient-3.1.jar try { HttpClient client = new Ht ...
- .net后台模拟浏览器get/post请求
#region 后台模拟浏览器get/post请求 /// <summary> /// 发送请求方式 /// </summary> /// <param name=&qu ...
- php中curl模拟浏览器来传输数据
cURL可以使用URL的语法模拟浏览器来传输数据, 因为它是模拟浏览器,因此它同样支持多种协议,FTP, FTPS, HTTP, HTTPS, GOPHER, TELNET, DICT, FILE 以 ...
- httpclient模拟浏览器get\post
一般的情况下我们都是使用IE或者Navigator浏览器来访问一个WEB服务器,用来浏览页面查看信息或者提交一些数据等等.所访问的这些页面有的仅 仅是一些普通的页面,有的需要用户登录后方可使用,或者需 ...
- curl模拟浏览器进行phpQuery抓取数据
报Warning: file_get_contents(http://www.dianping.com/shop/8042874) [function.file-get-contents]: fail ...
- python 模拟浏览器
想用python模拟浏览器访问web的方法测试些东西,有哪几种方法呢? 一类:单纯的访问web,不解析其js,css等. 1. urllib2 #-*- coding:utf-8 -* import ...
随机推荐
- Hack语言类型化简介
在typechecker的配合下,Hack语言的类型化能力是Hack其他功能特性的基石.开发Hack语言的主要动机也正是为代码提供显式类型标注以便对代码进行类型一致性和潜在错误分析. 这是用于对比Ha ...
- springmvc(4)注解简单了解
对于我这样的新人来说,因为是刚开始做项目,所以以前的技术不是用的很多,就比如springmvc来说,实际上使用的都是注解形式的,对于那些全部都是配置的来说,虽然也了解一些,但是实际上还是没试用过的. ...
- IOS----友盟推送详解
这两天好好的研究了下推送这功能,关于它我将分成两部分来讲,一.IOS手机端,二.Servlet服务端,今天先讲下IOS端 一.感受 下面讲下我对推送这个功能在IOS下的感受,这个算是我做了服务端的功能 ...
- Mac 连接阿里云服务器
1. 通过命令行连接 Server 并设置 1.1 连接 Server #: ssh root@hctec.top ssh: 远程连接工具 root: 远程服务器用户名, 此处我用的是: root 用 ...
- 在腾讯云上面搭建WordPress博客网站
一.准备工作 我们使用腾讯云服务器来运行WordPress,腾讯云会给新用户七天的产品体验时间还有一些代金券,做为新手尝试成本是比较小的. 腾讯云长期举办 "云+校园" 活动,学生 ...
- 探秘Java中的String、StringBuilder以及StringBuffer
探秘Java中String.StringBuilder以及StringBuffer 相信String这个类是Java中使用得最频繁的类之一,并且又是各大公司面试喜欢问 到的地方,今天就来和大家一起学习 ...
- 在Java中如何实现“Pless presss any key to continue.”
import java.util.*; class Continue{ public static void main(String[] args){ Scanner scanner=new Scan ...
- [DeviceOne开发]-白板的示例
一.简介 该demo通过do_Painterview这个组件实现画板的基本功能,模仿的是Appstore上的叫“白板”的应用,可以更改字体颜色,字体粗细,然后用手指进行绘制,可以回退,清屏,保存到相册 ...
- 关于WPF中文件夹浏览对话框的方式
文件夹浏览时dialogresult要写全引用路径 string path=null; FolderBrowserDialog fbd = new FolderBrowserDialog(); fbd ...
- word开发遇到的问题
1.系统不能安装多个office word版本,建议只安装一个2003版本,越完整越好. 2.安装时候ghost系统会遇到问题,由于很多组件没有完整的安装,因此缺少了很多安装时文件保护要进行提醒的dl ...