C# 关于爬取网站数据遇到csrf-token的分析与解决

需求

某航空公司物流单信息查询，是一个POST请求。通过后台模拟HTTP POST请求时无法获取页面数据；查看该航空公司网站后发现，网站启用了防御CSRF攻击的机制，模拟的请求会直接返回40X错误。

关于CSRF

读者自行百度

网站HTTP请求分析

Headers

Form Data

请求头（Headers）里包含了 cookie 与 x-csrf-token，Form Data 里包含了 _csrf（其值与请求头里的 x-csrf-token 相同）。

通过查看该网站的JS源代码发现，_csrf 的值来自网页 head 标签里的 meta 元素。

猜测cookie与 x-csrf-token是有一定的有效期，并且他们共同作用来防御CSRF攻击。

解决方案

1，首先请求一下该航空公司的网站，获取cookie与_csrf

2，然后用C#模拟HTTP请求，分别在请求头和Form Data里加入上述参数，发起请求

代码

 /// <summary>
 /// Loads a page once and keeps the cookie string and the CSRF-related
 /// <c>&lt;meta name="_csrf..." content="..."/&gt;</c> values found in its HTML,
 /// so they can be replayed on a later request to the same site.
 /// </summary>
 public class CSRFToken
    {
        // Matches one <meta name="_csrf..." ... content="..."> tag and captures the content value.
        // The character classes keep the match inside a single tag, so several meta tags
        // on the same line are reported separately and unrelated tags are not swallowed.
        static readonly Regex CsrfMetaRegex = new Regex(
            @"<meta name=""_csrf[^""]*""[^>]*?\scontent=""([^""]*)""[^>]*>",
            RegexOptions.None);

        string cookie; // cookie string returned by the site, to be sent back on later requests

        // content values of the _csrf meta tags, in document order; never null
        List<string> csrfs = new List<string>();

        /// <summary>
        /// Requests <paramref name="url"/> and extracts the cookie and CSRF meta values.
        /// This is best-effort: on a blank URL or any failure the cookie stays unset and
        /// the token list stays empty; the failure is written to the trace listeners.
        /// </summary>
        /// <param name="url">Address of the page that carries the CSRF meta tags.</param>
        public CSRFToken(string url)
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                return;
            }

            try
            {
                var _http = new HttpHelper(url);
                string responseCookie;
                string html = _http.CreateGetHttpResponseForPC(out responseCookie);
                this.cookie = responseCookie;

                // The helper may hand back no body (e.g. after a failed request); nothing to parse then.
                if (string.IsNullOrEmpty(html))
                {
                    return;
                }

                foreach (Match tag in CsrfMetaRegex.Matches(html))
                {
                    csrfs.Add(tag.Groups[1].Value);
                }
            }
            catch (Exception e)
            {
                // Keep the original best-effort contract (the constructor does not throw),
                // but leave a trace instead of silently discarding the error.
                System.Diagnostics.Trace.TraceWarning("CSRFToken: failed to load '{0}': {1}", url, e.Message);
            }
        }

        /// <summary>Returns the stored cookie string; null if none was obtained.</summary>
        public String getCookie()
        {
            return cookie;
        }

        /// <summary>Replaces the stored cookie string.</summary>
        /// <param name="cookie">New cookie string to store.</param>
        public void setCookie(String cookie)
        {
            this.cookie = cookie;
        }

        /// <summary>
        /// Returns the content values of the matched <c>_csrf</c> meta tags in document order.
        /// The list is empty (not null) when nothing was found or the page could not be loaded.
        /// </summary>
        public List<string> getCsrf_token()
        {
            return csrfs;
        }
    }

httpHelper

  /// <summary>
  /// Sends a form-encoded POST to the helper's base address and returns the response body
  /// decoded as UTF-8.
  /// </summary>
  /// <param name="headers">
  /// Extra request headers (for example X-CSRF-TOKEN, Cookie, Origin), added with
  /// <c>request.Headers.Add</c>; may be null or empty.
  /// NOTE(review): headers that HttpWebRequest exposes as properties (Referer, Host, ...)
  /// are presumably rejected by <c>Headers.Add</c> — confirm before passing them here.
  /// </param>
  /// <param name="parameters">
  /// Form fields to post; may be null or empty, in which case no body is written.
  /// Keys and values are URL-encoded by this method, so pass them unencoded.
  /// </param>
  /// <returns>The response body, or null when the request fails with a <see cref="WebException"/>.</returns>
  public string CreatePostHttpResponse(IDictionary<string, string> headers, IDictionary<string, string> parameters)
        {
            // Both ServicePointManager settings are assigned before the request is created,
            // so they are already in place when the request object is built.
            ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
            ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11;

            HttpWebRequest request = WebRequest.Create(_baseIPAddress) as HttpWebRequest;
            if (request == null)
            {
                throw new InvalidOperationException("The base address is not an HTTP or HTTPS address.");
            }

            request.ProtocolVersion = HttpVersion.Version10;
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.UserAgent = DefaultUserAgent;

            if (headers != null && headers.Count > 0)
            {
                foreach (KeyValuePair<string, string> header in headers)
                {
                    request.Headers.Add(header.Key, header.Value);
                }
            }

            // Write the form body only when there is something to post.
            if (parameters != null && parameters.Count > 0)
            {
                StringBuilder buffer = new StringBuilder();
                foreach (KeyValuePair<string, string> field in parameters)
                {
                    if (buffer.Length > 0)
                    {
                        buffer.Append('&');
                    }

                    // Encode both sides so characters such as '+', '&' and '=' inside a token
                    // or value cannot corrupt the form body.
                    buffer.Append(Uri.EscapeDataString(field.Key));
                    buffer.Append('=');
                    buffer.Append(Uri.EscapeDataString(field.Value ?? string.Empty));
                }

                byte[] data = Encoding.UTF8.GetBytes(buffer.ToString());
                using (Stream stream = request.GetRequestStream())
                {
                    stream.Write(data, 0, data.Length);
                }
            }

            try
            {
                // using blocks release the connection even if reading the body throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (WebException ex)
            {
                // The error response (e.g. a 40X) holds a connection too; close it before giving up.
                if (ex.Response != null)
                {
                    ex.Response.Close();
                }

                return null;
            }
        }

   /// <summary>
   /// Sends a GET to the helper's base address and returns the response body decoded as UTF-8,
   /// together with the raw value of the response's Set-Cookie header.
   /// </summary>
   /// <param name="cookie">
   /// Receives the value of the Set-Cookie response header (null when the header is absent),
   /// or an empty string when the request fails with a <see cref="WebException"/>.
   /// </param>
   /// <returns>The response body, or null when the request fails with a <see cref="WebException"/>.</returns>
   public string CreateGetHttpResponse(out string cookie)
        {
            // Both ServicePointManager settings are assigned before the request is created,
            // so they are already in place when the request object is built.
            ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
            ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11;

            HttpWebRequest request = WebRequest.Create(_baseIPAddress) as HttpWebRequest;
            if (request == null)
            {
                throw new InvalidOperationException("The base address is not an HTTP or HTTPS address.");
            }

            request.ProtocolVersion = HttpVersion.Version10;
            request.Method = "GET";
            request.ContentType = "application/x-www-form-urlencoded";
            request.UserAgent = DefaultUserAgent;

            try
            {
                // using blocks release the connection even if reading the body throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    cookie = response.Headers["Set-Cookie"];

                    using (Stream body = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
            catch (WebException ex)
            {
                // The error response holds a connection too; close it before giving up.
                if (ex.Response != null)
                {
                    ex.Response.Close();
                }

                cookie = "";
                return null;
            }
        }

爬取程序

爬取结果

浏览器结果

注意事项与结论

1，不同的网站，获取csrf token的方式不一样；但无论怎么做，只要信息传到了前台，我们就有相应的方法来获取。

2，请求时的HTTPS安全协议可能不一样；测试某航空公司物流信息的时候，请求使用的安全协议是TLS 1.2。

ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11; 还有其他参数比如UserAgent后台可能也会验证

3，基于如上航空公司，发现它的cookie和csrf_token在一定时间内不会改变，因此实际爬取的时候可以考虑缓存cookie以及csrf_token，只有当请求失败的时候，才重新获取。