今天接到一个活,需要统计人员的工号信息,由于种种原因不能直接连数据库 [无奈]、[无奈]、[无奈]。采取迂回方案,写个工具自动登录网站,采集用户信息。

这也不是第一次采集ASP.NET网站,以前采集的时候就知道,这种网站采集比较麻烦,尤其是WebForm的ASP.NET 网站,那叫一个费劲。

喜欢现在流行的Restful模式的网站,数据接口采集那才叫舒服。

闲话少说,开干

工作量不大,HTTP纯手写

先准备下一个GET/POST预备使用

  1.     public static string Get(string url, Action<string> SuccessCallback, Action<string> FailCallback) {
  2. HttpWebRequest req = WebRequest.Create(url) as HttpWebRequest;
  3. req.Method = "GET";
  4. req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
  5. req.Accept = "*/*";
  6. req.KeepAlive = true;
  7. req.ServicePoint.ConnectionLimit = int.MaxValue;
  8. req.ServicePoint.Expect100Continue = false;
  9. req.CookieContainer = sznyCookie; #静态变量
  10. req.Credentials = System.Net.CredentialCache.DefaultCredentials;
  11. string msg = "";
  12. using (HttpWebResponse rsp = req.GetResponse() as HttpWebResponse)
  13. {
  14. using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
  15. {
  16. msg = reader.ReadToEnd();
  17. }
  18. }
  19. return msg;
  20. }
  21.  
  22.     public static string Post(string url, Dictionary<string, string> dicParms, Action<string> SuccessCallback, Action<string> FailCallback) {
  23. StringBuilder data = new StringBuilder();
  24. foreach (var kv in dicParms) {
  25. if (kv.Key.StartsWith("header"))
  26. continue;
  27. data.Append($"&{Common.UrlEncode( kv.Key,Encoding.UTF8)}={ Common.UrlEncode( kv.Value,Encoding.UTF8)}");
  28. }
  29. if (data.Length > )
  30. data.Remove(, );
  31. HttpWebRequest req = WebRequest.Create(url) as HttpWebRequest;
  32. req.Method = "POST";
  33. req.KeepAlive = true;
  34. req.CookieContainer = sznyCookie;
  35. req.Connection = "KeepAlive";
  36. req.KeepAlive = true;
  37. req.ContentType = "application/x-www-form-urlencoded";
  38. req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
  39. req.Referer = url;
  40. if (dicParms.ContainsKey("ScriptManager1"))
  41. {
  42. req.Headers.Add("X-MicrosoftAjax", "Delta=true");
  43. req.Headers.Add("X-Requested-With", "XMLHttpRequest");
  44. req.ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
  45. req.Accept = "*/*";
  46. }
  47. req.Headers.Add("Cache-Control", "no-cache");
  48.  
  49. req.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
  50. req.ServicePoint.ConnectionLimit = int.MaxValue;
  51. req.ServicePoint.Expect100Continue = false;
  52. req.AllowAutoRedirect = true;
  53. req.Credentials = System.Net.CredentialCache.DefaultCredentials;
  54.  
  55. byte[] buffer = Encoding.UTF8.GetBytes(data.ToString());
  56. using (Stream reqStream = req.GetRequestStream())
  57. {
  58. reqStream.Write(buffer, , buffer.Length);
  59. }
  60. string msg = "";
  61. using (HttpWebResponse rsp = req.GetResponse() as HttpWebResponse)
  62. {
  63. using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
  64. {
  65.  
  66. msg = reader.ReadToEnd();
  67. if (msg.Contains("images/dl.jpg") || msg.Contains("pageRedirect||%2flogin.aspx"))
  68. {
  69. //登录失败
  70. if (FailCallback != null)
  71. FailCallback(msg);
  72. }
  73. else {
  74. if (SuccessCallback!=null)
  75. SuccessCallback(msg);
  76. }
  77. }
  78. }
  79. return msg;
  80. }

整个过程分为登录、用户信息列表、用户信息详情,分三步走来完成这个项目

登录

根据Chrome抓包结果编写Login,账号密码没有任何加密,直接明文显示了,直接用了,根据是否跳转页面判断是否登录成功。调试查看结果登录成功了。

根据上面的抓包数据,可以调用下面的代码确定是否登录成功。

  1.       public static bool SznyLogin(string username, string password, Action<string> SuccessCallback, Action<string> FailCallback) {
  2. string url = "http://127.0.0.1/login.aspx";
  3. string msg = Get(url, SuccessCallback, FailCallback);
  4. if (msg.Trim().Length > ) {
  5. Dictionary<string, string> dicParms = new Dictionary<string, string>();
  6. dicParms.Add("__VIEWSTATE", "");
  7. dicParms.Add("__EVENTVALIDATION", "");
  8. dicParms.Add("Text_Name", "");
  9. dicParms.Add("Text_Pass", "");
  10. dicParms.Add("btn_Login.x", new Random().Next().ToString());
  11. dicParms.Add("btn_Login.y", new Random().Next().ToString());
  12. MatchCollection mc = Regex.Matches(msg, @"<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?value=""(?<val>[^""]*?)""[^<>]*?/?>|<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>|<select[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  13. foreach (Match mi in mc)
  14. {
  15. if (dicParms.ContainsKey(mi.Groups["name"].Value.Trim()))
  16. dicParms[mi.Groups["name"].Value.Trim()] = mi.Groups["val"].Value.Trim();
  17. }
  18. dicParms["Text_Name"] = username;
  19. dicParms["Text_Pass"] = password;
  20.  
  21. msg=Post(url, dicParms, SuccessCallback, FailCallback);
  22.  
  23. if (msg.Contains("images/dl.jpg") || msg.Contains("pageRedirect||%2flogin.aspx"))
  24. {
  25. return false;
  26. }
  27. else
  28. return true;
  29. }
  30. return false;
  31. }

抓取人员信息

看到下面这个页面,失望了,列表上没有工号,如果列表上有工号 设置一页显示全部信息就可以把所有的数据都抓取到了。

换个思路:是不是我直接设置一页显示所有的数据后,然后根据员工ID可以获取到所有的信息呢?

接下来点击任意一条信息后,查看详情,显示下面的调用结果。Url上没有ID,Get这条路走不通了,查看Post的数据,更失望,没有ID,通过行信息绑定。传统的WebForm 提交模式…

把所有的数据显示到一页,把列表的数据先采集完,然后最后一个页面一个页面的采集工号信息。

// Shared cookie container: keeps the login session alive across all requests.
public static CookieContainer sznyCookie = new CookieContainer();
/// <summary>
/// Employee info: row sequence number -> (field name -> value).
/// </summary>
public static Dictionary<int, Dictionary<string,string>> dicSznyEmployees = new Dictionary<int, Dictionary<string, string>>();

// Snapshot of list-page form parameters (__VIEWSTATE etc.) reused by later posts.
public static Dictionary<string, string> dicSznyEmployeeParms = new Dictionary<string, string>();
/// <summary>
/// Employee sequence numbers still pending scraping (consumed by a timer task).
/// </summary>
public static ConcurrentQueue<int> queueSznyEmployeeInfo = new ConcurrentQueue<int>();

// Sequence numbers whose detail page has been scraped successfully.
public static ConcurrentQueue<int> queueSuccessEmployeeInfo = new ConcurrentQueue<int>();
  15.     public static bool SznyEmployeeList(Action<string> SuccessCallback, Action<string> FailCallback)
  16. {
  17. string url = $"http://127.0.0.1/HumanResources/EmployeeManage/EmployeeInfoList.aspx";
  18. string msg = Get(url, SuccessCallback, FailCallback);
  19. if (msg.Trim().Length > )
  20. {
  21. //统计参数
  22. //__doPostBack\('(?<name>[^']*?)'
  23. //new Regex(@"__doPostBack\('(?<name>[^']*?)'", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  24. string name = "";
  25. MatchCollection mc = Regex.Matches(msg, @"__doPostBack\('(?<name>[^']*?)'", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  26. foreach (Match mi in mc)
  27. {
  28. name = mi.Groups["name"].Value.Trim();
  29. break;
  30. }
  31. //(?<=<a[^<>]*?href="javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)"[^<>]*?>条/页)
  32. //new Regex(@"(?<=<a[^<>]*?href=""javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)""[^<>]*?>条/页)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  33. string smname = "";
  34. Match m = Regex.Match(msg, @"(?<=<a[^<>]*?href=""javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)""[^<>]*?>条/页)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  35. if (m.Success)
  36. smname = m.Value.Trim().Replace("'", "").Replace("'", "");
  37.  
  38. //<input[^<>]*?name="(?<name>[^"]*?)"[^<>]*?value="(?<val>[^"]*?)"[^<>]*?/?>|<input[^<>]*?name="(?<name>[^"]*?)"[^<>]*?"[^<>]*?/?>|<select[^<>]*?name="(?<name>[^"]*?)"[^<>]*?"[^<>]*?/?>
  39. //new Regex(@"<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?value=""(?<val>[^""]*?)""[^<>]*?/?>|<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>|<select[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  40. Dictionary<string, string> dicParms = new Dictionary<string, string>();
  41. dicParms.Add("ScriptManager1", $"UpdatePanel1|{smname}");
  42. dicParms.Add("__EVENTTARGET", smname);
  43. dicParms.Add("__EVENTARGUMENT", "");
  44. dicParms.Add("__VIEWSTATE", "");
  45. dicParms.Add("__EVENTVALIDATION", "");
  46. dicParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
  47. dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", "");
  48. dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "");
  49. dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalCount", "");
  50. dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "");
  51. dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "");
  52. dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "");
  53. dicParms.Add("XM", "ZXMCHECK");
  54.  
  55. List<string> lstParms = new List<string>() { "XM", "MdGridView_t_unitemployees_dwyg_iCurrentPage", "MdGridView_t_unitemployees_dwyg_GridViewID", "MdGridView_t_unitemployees_dwyg_iCurrentNum", "MdGridView_t_unitemployees_dwyg_iPageCount", "MdGridView_t_unitemployees_dwyg_iPageSize", "Button_Query", "__EVENTTARGET", "__EVENTARGUMENT", "Button_SelQuery", "Button_view", "Button_edit", "Button_out", "ImageButton_Tx", "ImageButton_xx1", "Button_qd", "MdGridView_t_unitemployees_dwyg_GridViewID", "__ASYNCPOST", "MdGridView_t_unitemployees_dwyg__PageSetText" };
  56. mc = Regex.Matches(msg, @"<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?value=""(?<val>[^""]*?)""[^<>]*?/?>|<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>|<select[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  57. foreach (Match mi in mc)
  58. {
  59.  
  60. if (lstParms.Contains(mi.Groups["name"].Value.Trim()))
  61. continue;
  62.  
  63. if (dicParms.ContainsKey(mi.Groups["name"].Value.Trim()))
  64. dicParms[mi.Groups["name"].Value.Trim()] = mi.Groups["val"].Value.Trim();
  65. else
  66. dicParms.Add(mi.Groups["name"].Value.Trim(), mi.Groups["val"].Value.Trim());
  67. }
  68. if (dicParms.ContainsKey("MdGridView_t_unitemployees_dwyg$_PageSetText"))
  69. dicParms["MdGridView_t_unitemployees_dwyg$_PageSetText"] = "";
  70. else
  71. dicParms.Add("MdGridView_t_unitemployees_dwyg$_PageSetText", "");//1200条 每页
  72.  
  73. msg = Post(url, dicParms, SuccessCallback, FailCallback);
  74.  
  75. dicSznyEmployees.Clear();
  76. dicSznyEmployeeParms.Clear();
  77. dicSznyEmployeeParms.Clear();
  78. dicSznyEmployeeParms.Add("__EVENTTARGET", "");
  79. dicSznyEmployeeParms.Add("__EVENTARGUMENT", "");
  80. dicSznyEmployeeParms.Add("__VIEWSTATE", dicParms["__VIEWSTATE"]);
  81. dicSznyEmployeeParms.Add("__EVENTVALIDATION", dicParms["__EVENTVALIDATION"]);
  82. dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
  83. dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", "");
  84. dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "");
  85. dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iTotalCount", "");
  86. dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "");
  87. dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "");
  88. dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "");
  89. dicSznyEmployeeParms.Add("XM", "ZXMCHECK");
  90.  
  91. lstParms.Clear();
  92. lstParms = new List<string>() { "XM", "__EVENTTARGET", "__EVENTARGUMENT", "Button_Query", "Button_SelQuery", };
  93. lstParms.Add("Button_edit");
  94. lstParms.Add("Button_out");
  95. lstParms.Add("ImageButton_Tx");
  96. lstParms.Add("ImageButton_xx1");
  97. lstParms.Add("Button_qd");
  98. lstParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID");
  99. lstparms.add("mdgridview_t_unitemployees_dwyg_icurrentpage");
  100. lstparms.add("mdgridview_t_unitemployees_dwyg_itotalpage");
  101. lstparms.add("mdgridview_t_unitemployees_dwyg_itotalcount");
  102. lstparms.add("mdgridview_t_unitemployees_dwyg_ipagesize");
  103. lstparms.add("mdgridview_t_unitemployees_dwyg_ipagecount");
  104. lstParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum");
  105.  
  106. mc = Regex.Matches(msg, @"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  107. foreach (Match mi in mc)
  108. {
  109.  
  110. if (lstParms.Contains(mi.Groups["name"].Value.Trim()))
  111. continue;
  112.  
  113. if (dicSznyEmployeeParms.ContainsKey(mi.Groups["name"].Value.Trim()))
  114. dicSznyEmployeeParms[mi.Groups["name"].Value.Trim()] = mi.Groups["val"].Value.Trim();
  115. else
  116. dicSznyEmployeeParms.Add(mi.Groups["name"].Value.Trim(), mi.Groups["val"].Value.Trim());
  117. }
  118. int cnt = int.Parse(dicSznyEmployeeParms["MdGridView_t_unitemployees_dwyg_iTotalCount"]);
  119. for (int i = ; i <= cnt; i++)
  120. queueSznyEmployeeInfo.Enqueue(i);
  121.  
  122. //获取TR
  123. //< tr[^<>] *? name = "SelectTR"[^<>] *?>.*?</ tr >
  124. //new Regex(@"<tr[^<>]*?name=""SelectTR""[^<>]*?>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  125. mc = Regex.Matches(msg, @"<tr[^<>]*?name=""SelectTR""[^<>]*?>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  126. foreach (Match mi in mc)
  127. {
  128. //获取td
  129. //(?<=<td[^<>]*?>).*?(?=</td>)
  130. //new Regex("(?<=<td[^<>]*?>).*?(?=</td>)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  131. MatchCollection mic = Regex.Matches(mi.Value, "(?<=<td[^<>]*?>).*?(?=</td>)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  132. int ix = int.Parse(mic[].Value.Trim());
  133. if (!dicSznyEmployees.ContainsKey(ix))
  134. {
  135. dicSznyEmployees.Add(ix, new Dictionary<string, string>());
  136. }
  137. queueSznyEmployeeInfo.Enqueue(ix);
  138.  
  139. dicSznyEmployees[ix].Add("UserName", mic[].Value.Trim().Replace("&nbsp;", ""));
  140. dicSznyEmployees[ix].Add("PersonID", mic[].Value.Trim().Replace("&nbsp;", ""));
  141. dicSznyEmployees[ix].Add("Birthday", mic[].Value.Trim().Replace("&nbsp;", ""));
  142. dicSznyEmployees[ix].Add("Sex", mic[].Value.Trim().Replace("&nbsp;", ""));
  143. dicSznyEmployees[ix].Add("HomePhone", mic[].Value.Trim().Replace("&nbsp;", ""));
  144. dicSznyEmployees[ix].Add("TelPhone", mic[].Value.Trim().Replace("&nbsp;", ""));
  145. dicSznyEmployees[ix].Add("Mail", mic[].Value.Trim().Replace("&nbsp;", ""));
  146. dicSznyEmployees[ix].Add("Address", mic[].Value.Trim().Replace("&nbsp;", ""));
  147. dicSznyEmployees[ix].Add("MinZu", mic[].Value.Trim().Replace("&nbsp;", ""));
  148. dicSznyEmployees[ix].Add("AddressJiGuan", mic[].Value.Trim().Replace("&nbsp;", ""));
  149. dicSznyEmployees[ix].Add("ZhengZhiMianmao", mic[].Value.Trim().Replace("&nbsp;", ""));
  150. dicSznyEmployees[ix].Add("Paiqianshijian", mic[].Value.Trim().Replace("&nbsp;", ""));
  151. dicSznyEmployees[ix].Add("Remark", mic[].Value.Trim().Replace("&nbsp;", ""));
  152. }
  153. }
  154. return true;
  155. }

这样所有的人员信息一次性采集到静态变量字典中了,剩下的一个工号可以慢慢获取了。

既然是这样,老实地分析Post数据,按照格式Post数据吧。

分析完Post的数据后,突发奇想,我是不是可以通过相同的__ViewState和__EVENTVALIDATION POST数据呢?说干就干。

写代码跳转到员工列表页面,然后POST数据设置一页显示所有数据。

所有的POST的参数,保存到一个静态变量中。

发现POST批量提交的时候,前3次正常,以后就直接未登录。

果断放弃,换思路。

那如果这样不行 可不可以把所有的数据放到一个页面上,然后每次获取一次页面,然后根据顺序号POST数据呢。

上面已经把所有的列表数据都采集完了,顺序号也固定了,然后在POST数据的时候,发现有的人员和工号不对应。

这时候去分析为什么数据会出现不对应的情况呢?发现正则表达式写的还有问题。获取页面的Input的时候,属性有可能使用双引号,也有可能使用单引号。

正则表达式由原来的

  1. <input[^<>]*?name="(?<name>[^"]*?)"[^<>]*?value="(?<val>[^"]*?)"[^<>]*?/?>|<input[^<>]*?name="(?<name>[^"]*?)"[^<>]*?"[^<>]*?/?>|<select[^<>]*?name="(?<name>[^"]*?)"[^<>]*?"[^<>]*?/?>

修改为

  1. <input[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?value=["'](?<val>[^'"]*?)["'][^<>]*?/?>|<input[^<>]*?name=["'](?<name>[^'"]*?)["'][^<>]*?["'][^<>]*?/?>|<select[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?["'][^<>]*?/?>

由于网站异步提交,也就是以前WEBForm采用的ScriptManager,提交的时候返回的HTML不是整个Document,没有注意,以为没有返回__ViewState。所以采用GET的时候获取的__ViewState继续执行获取工号的操作。发现获取的工号都是错误,人员与工号对不上

麻爪了,不知道该咋办了。犹豫了一下后,上Fiddler吧,一点点的看提交的参数是否有区别。发现正常网站在Get到页面后,通过调整每页x条数据后,提交的ViewState与原来的不一致。寻寻觅觅 觅觅寻寻  最后发现异步返回的HTML中,最后有ViewState….

由于返回的数据顺序,每次也不一样,也是造成人员、工号不一致的原因。

提交后正常了,但是1000多条的员工信息,每次提交都是2000多个参数。看着冗长的POST数据,无语了。这样提交 先不说网站本身就慢。我提交这么多网站会不会更慢,我的系统是不是也会更慢。

怎么办?

是不是有可能把分页设置成每页只有一条数据,然后每次翻页,采集数据。简单试试吧

先修改获取列表页面数据,把数据设置成一条每页,此时不再采集列表中的信息。而是记录总共多少页,放入队列中,共定时任务去分页采集数据。列表信息通过后面的分页数据采集。

由于网站是内部系统,为了不影响系统的正常运行,每次只采集一条信息,等待这条信息采集完成后,在采集下一页信息。

采集列表

  1.      public static void ReqSznyEmployeeList(int ix,Action<string> SuccessCallback, Action<string> FailCallback)
  2. {
  3.  
  4. string url = $"http://127.0.0.1/HumanResources/EmployeeManage/EmployeeInfoList.aspx";
  5. HttpWebRequest req = WebRequest.Create(url) as HttpWebRequest;
  6. req.Method = "GET";
  7. req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
  8. req.Accept = "*/*";
  9. req.KeepAlive = true;
  10. req.ServicePoint.ConnectionLimit = int.MaxValue;
  11. req.ServicePoint.Expect100Continue = false;
  12. req.CookieContainer = sznyCookie;
  13. req.Credentials = System.Net.CredentialCache.DefaultCredentials;
  14.  
  15. req.BeginGetResponse(new AsyncCallback(RspSznyEmployeeList), new object[] { req, url,ix, SuccessCallback, FailCallback });
  16. }
  17.  
  18.      private static void RspSznyEmployeeList(IAsyncResult result)
  19. {
  20. object[] parms = result.AsyncState as object[];
  21. HttpWebRequest req = parms[] as HttpWebRequest;
  22. string url = parms[].ToString();
  23. int ix = int.Parse(parms[].ToString());
  24. Action<string> SuccessCallback = parms[] as Action<string>;
  25. Action<string> FailCallback = parms[] as Action<string>;
  26. try
  27. {
  28. using (HttpWebResponse rsp = req.EndGetResponse(result) as HttpWebResponse)
  29. {
  30. using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
  31. {
  32. string msg = "";
  33. msg = reader.ReadToEnd();
  34. //统计参数
  35. //__doPostBack\('(?<name>[^']*?)'
  36. //new Regex(@"__doPostBack\('(?<name>[^']*?)'", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  37. string name = "";
  38. MatchCollection mc = Regex.Matches(msg, @"__doPostBack\('(?<name>[^']*?)'", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  39. foreach (Match mi in mc)
  40. {
  41. name = mi.Groups["name"].Value.Trim();
  42. break;
  43. }
  44.  
  45. //(?<=<a[^<>]*?href="javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)"[^<>]*?>条/页)
  46. //new Regex(@"(?<=<a[^<>]*?href=""javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)""[^<>]*?>条/页)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  47. string smname = "MdGridView_t_unitemployees_dwyg$_SearchGo";
  48. //Match m = Regex.Match(msg, @"(?<=<a[^<>]*?href=""javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)""[^<>]*?>条/页)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  49. //<input[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?value=["'](?<val>[^'"]*?)["'][^<>]*?/?>|<input[^<>]*?name=["'](?<name>[^'"]*?)["'][^<>]*?["'][^<>]*?/?>|<select[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?["'][^<>]*?/?>
  50. //new Regex(@"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  51. Dictionary<string, string> dicParms = new Dictionary<string, string>();
  52. dicParms.Add("ScriptManager1", $"UpdatePanel1|{smname}");
  53. dicParms.Add("__EVENTTARGET", smname);
  54. dicParms.Add("__EVENTARGUMENT", "");
  55. dicParms.Add("__VIEWSTATE", "");
  56. dicParms.Add("__EVENTVALIDATION", "");
  57. dicParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
  58. dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", ix.ToString());
  59. dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "");
  60. dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalCount", "");
  61. dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "");
  62. dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "");
  63. dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "");
  64. //dicParms.Add("MdGridView_t_unitemployees_dwyg$_SearchTextBox", ix.ToString());
  65.  
  66. dicParms.Add("XM", "ZXMCHECK");
  67. List<string> lstParms = new List<string>() { "ScriptManager1", "XM", "MdGridView_t_unitemployees_dwyg_iCurrentNum", "Button_Query", "__EVENTTARGET", "__EVENTARGUMENT", "Button_SelQuery", "Button_view", "Button_edit", "Button_out", "ImageButton_Tx", "ImageButton_xx1", "Button_qd", "__ASYNCPOST", "MdGridView_t_unitemployees_dwyg__PageSetText" };
  68. mc = Regex.Matches(msg, @"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  69. foreach (Match mi in mc)
  70. {
  71.  
  72. if (lstParms.Contains(mi.Groups["name"].Value.Trim()))
  73. continue;
  74.  
  75. if (dicParms.ContainsKey(mi.Groups["name"].Value.Trim()))
  76. dicParms[mi.Groups["name"].Value.Trim()] = mi.Groups["val"].Value.Trim();
  77. else
  78. dicParms.Add(mi.Groups["name"].Value.Trim(), mi.Groups["val"].Value.Trim());
  79. }
  80. if (dicParms.ContainsKey("MdGridView_t_unitemployees_dwyg$_PageSetText"))
  81. dicParms["MdGridView_t_unitemployees_dwyg$_PageSetText"] = "";
  82. else
  83. dicParms.Add("MdGridView_t_unitemployees_dwyg$_PageSetText", "");//1200条 每页
  84. if (dicParms.ContainsKey("MdGridView_t_unitemployees_dwyg_iPageCount"))
  85. dicParms["MdGridView_t_unitemployees_dwyg_iPageCount"] = "";
  86. else
  87. dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "");
  88. if (dicParms.ContainsKey("MdGridView_t_unitemployees_dwyg_iPageSize"))
  89. dicParms["MdGridView_t_unitemployees_dwyg_iPageSize"] = "";
  90. else
  91. dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "");
  92.  
  93. if (dicParms.ContainsKey("MdGridView_t_unitemployees_dwyg$_SearchTextBox"))
  94. dicParms["MdGridView_t_unitemployees_dwyg$_SearchTextBox"] = $"{ix}";
  95. else
  96. dicParms.Add("MdGridView_t_unitemployees_dwyg$_SearchTextBox", $"{ix}");/*第几页*/
  97.  
  98. dicParms["MdGridView_t_unitemployees_dwyg_iTotalPage"] = dicParms["MdGridView_t_unitemployees_dwyg_iTotalCount"];
  99. msg = Post(url, dicParms, SuccessCallback, FailCallback);
  100.  
  101. //获取TR
  102. //<tr[^<>]*?name="SelectTR"[^<>]*?>.*?</tr>
  103. //new Regex(@"<tr[^<>]*?name=""SelectTR""[^<>]*?>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  104. mc = Regex.Matches(msg, @"<tr[^<>]*?name=""SelectTR""[^<>]*?>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  105. foreach (Match mi in mc)
  106. {
  107. //获取td
  108. //(?<=<td[^<>]*?>).*?(?=</td>)
  109. //new Regex("(?<=<td[^<>]*?>).*?(?=</td>)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  110. MatchCollection mic = Regex.Matches(mi.Value, "(?<=<td[^<>]*?>).*?(?=</td>)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  111. if (!dicSznyEmployees.ContainsKey(ix))
  112. {
  113. dicSznyEmployees.Add(ix, new Dictionary<string, string>());
  114. }
  115. //queueSznyEmployeeInfo.Enqueue(ix);
  116.  
  117. if (!dicSznyEmployees[ix].ContainsKey("UserName"))
  118. dicSznyEmployees[ix].Add("UserName", mic[].Value.Trim().Replace("&nbsp;", ""));
  119. else
  120. dicSznyEmployees[ix]["UserName"] = mic[].Value.Trim().Replace("&nbsp;", "");
  121. if (!dicSznyEmployees[ix].ContainsKey("PersonID"))
  122. dicSznyEmployees[ix].Add("PersonID", mic[].Value.Trim().Replace("&nbsp;", ""));
  123. else
  124. dicSznyEmployees[ix]["PersonID"] = mic[].Value.Trim().Replace("&nbsp;", "");
  125.  
  126. if (!dicSznyEmployees[ix].ContainsKey("Birthday"))
  127. dicSznyEmployees[ix].Add("Birthday", mic[].Value.Trim().Replace("&nbsp;", ""));
  128. else
  129. dicSznyEmployees[ix]["Birthday"] = mic[].Value.Trim().Replace("&nbsp;", "");
  130. if (!dicSznyEmployees[ix].ContainsKey("Sex"))
  131. dicSznyEmployees[ix].Add("Sex", mic[].Value.Trim().Replace("&nbsp;", ""));
  132. else
  133. dicSznyEmployees[ix]["Sex"] = mic[].Value.Trim().Replace("&nbsp;", "");
  134. if (!dicSznyEmployees[ix].ContainsKey("HomePhone"))
  135. dicSznyEmployees[ix].Add("HomePhone", mic[].Value.Trim().Replace("&nbsp;", ""));
  136. else
  137. dicSznyEmployees[ix]["HomePhone"] = mic[].Value.Trim().Replace("&nbsp;", "");
  138. if (!dicSznyEmployees[ix].ContainsKey("TelPhone"))
  139. dicSznyEmployees[ix].Add("TelPhone", mic[].Value.Trim().Replace("&nbsp;", ""));
  140. else
  141. dicSznyEmployees[ix]["TelPhone"] = mic[].Value.Trim().Replace("&nbsp;", "");
  142. if (!dicSznyEmployees[ix].ContainsKey("Mail"))
  143. dicSznyEmployees[ix].Add("Mail", mic[].Value.Trim().Replace("&nbsp;", ""));
  144. else
  145. dicSznyEmployees[ix]["Mail"] = mic[].Value.Trim().Replace("&nbsp;", "");
  146. if (!dicSznyEmployees[ix].ContainsKey("Address"))
  147. dicSznyEmployees[ix].Add("Address", mic[].Value.Trim().Replace("&nbsp;", ""));
  148. else
  149. dicSznyEmployees[ix]["Address"] = mic[].Value.Trim().Replace("&nbsp;", "");
  150. if (!dicSznyEmployees[ix].ContainsKey("MinZu"))
  151. dicSznyEmployees[ix].Add("MinZu", mic[].Value.Trim().Replace("&nbsp;", ""));
  152. else
  153. dicSznyEmployees[ix]["MinZu"] = mic[].Value.Trim().Replace("&nbsp;", "");
  154. if (!dicSznyEmployees[ix].ContainsKey("AddressJiGuan"))
  155. dicSznyEmployees[ix].Add("AddressJiGuan", mic[].Value.Trim().Replace("&nbsp;", ""));
  156. else
  157. dicSznyEmployees[ix]["AddressJiGuan"] = mic[].Value.Trim().Replace("&nbsp;", "");
  158. if (!dicSznyEmployees[ix].ContainsKey("ZhengZhiMianmao"))
  159. dicSznyEmployees[ix].Add("ZhengZhiMianmao", mic[].Value.Trim().Replace("&nbsp;", ""));
  160. else
  161. dicSznyEmployees[ix]["ZhengZhiMianmao"] = mic[].Value.Trim().Replace("&nbsp;", "");
  162. if (!dicSznyEmployees[ix].ContainsKey("Paiqianshijian"))
  163. dicSznyEmployees[ix].Add("Paiqianshijian", mic[].Value.Trim().Replace("&nbsp;", ""));
  164. else
  165. dicSznyEmployees[ix]["Paiqianshijian"] = mic[].Value.Trim().Replace("&nbsp;", "");
  166. if (!dicSznyEmployees[ix].ContainsKey("Remark"))
  167. dicSznyEmployees[ix].Add("Remark", mic[].Value.Trim().Replace("&nbsp;", ""));
  168. else
  169. dicSznyEmployees[ix]["Remark"] = mic[].Value.Trim().Replace("&nbsp;", "");
  170.  
  171. }
  172. dicParms.Clear();
  173. mc = Regex.Matches(msg, @"(?<name>__VIEWSTATE)\|(?<v>[^\|]+)|(?<name>__EVENTVALIDATION)\|(?<v>[^\|]+)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  174. foreach (Match mi in mc)
  175. {
  176. dicParms.Add(mi.Groups["name"].Value.Trim(), mi.Groups["v"].Value.Trim());
  177. }
  178.  
  179. dicParms.Add("HiddenField_param", "");
  180. dicParms.Add("__EVENTTARGET", "");
  181. dicParms.Add("__EVENTARGUMENT", "");
  182. dicParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
  183. dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", ix.ToString());
  184. dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "");
  185. dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalCount", "");
  186. dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "");
  187. dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "");
  188. dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "");
  189. dicParms.Add("XM", "ZXMCHECK");
  190. lstParms.Clear();
  191. lstParms = new List<string>() { "XM", "__EVENTTARGET", "__EVENTARGUMENT", "Button_Query", "Button_SelQuery", };
  192. lstParms.Add("Button_edit");
  193. lstParms.Add("Button_out");
  194. lstParms.Add("ImageButton_Tx");
  195. lstParms.Add("ImageButton_xx1");
  196. lstParms.Add("Button_qd");
  197.  
  198. mc = Regex.Matches(msg, @"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  199. foreach (Match mi in mc)
  200. {
  201. if (lstParms.Contains(mi.Groups["name"].Value.Trim()))
  202. continue;
  203. if (dicParms.ContainsKey(mi.Groups["name"].Value.Trim()))
  204. dicParms[mi.Groups["name"].Value.Trim()] = mi.Groups["val"].Value.Trim();
  205. else
  206. dicParms.Add(mi.Groups["name"].Value.Trim(), mi.Groups["val"].Value.Trim());
  207. }
  208. ReqSznyEmployeeInfo(ix, dicParms, SuccessCallback, FailCallback);
  209. }
  210. }
  211. }
  212. catch (Exception ex) {
  213. Business.queueSznyEmployeeInfo.Enqueue(ix);
  214. Business.queueMsg.Enqueue($"{DateTime.Now.ToString("yyy-MM-dd HH:mm:ss")}{ex.Message}");
  215. }
  216. }

获取工号

  1.     public static void ReqSznyEmployeeInfo(int ix,Dictionary<string,string> dicParms, Action<string> SuccessCallback, Action<string> FailCallback) {
  2. StringBuilder data = new StringBuilder();
  3. foreach (var kv in dicParms)
  4. {
  5. if (kv.Key.StartsWith("header"))
  6. continue;
  7. data.Append($"&{Common.UrlEncode(kv.Key, Encoding.UTF8)}={ Common.UrlEncode(kv.Value, Encoding.UTF8)}");
  8. }
  9. if (data.Length > )
  10. data.Remove(, );
  11. HttpWebRequest req = WebRequest.Create("http://127.0.0.1/HumanResources/EmployeeManage/EmployeeInfoList.aspx") as HttpWebRequest;
  12. req.Method = "POST";
  13. req.KeepAlive = true;
  14. req.CookieContainer = sznyCookie;
  15. req.ContentType = "application/x-www-form-urlencoded";
  16. req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
  17. if (dicParms.ContainsKey("ScriptManager1"))
  18. {
  19. req.Headers.Add("X-MicrosoftAjax", "Delta=true");
  20. req.Headers.Add("X-Requested-With", "XMLHttpRequest");
  21. req.ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
  22. req.Accept = "*/*";
  23. }
  24. req.Headers.Add("Cache-Control", "max-age=0");
  25. req.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
  26. req.ServicePoint.ConnectionLimit = int.MaxValue;
  27. req.ServicePoint.Expect100Continue = false;
  28. req.AllowAutoRedirect = true;
  29. req.Credentials = System.Net.CredentialCache.DefaultCredentials;
  30. byte[] buffer = Encoding.UTF8.GetBytes(data.ToString());
  31. using (Stream reqStream = req.GetRequestStream())
  32. {
  33. reqStream.Write(buffer, , buffer.Length);
  34. }
  35. req.BeginGetResponse(new AsyncCallback(RspSznyEmployeeInfo), new object[] { req,ix, dicParms, SuccessCallback, FailCallback });
  36. }
  37.  
  38. private static void RspSznyEmployeeInfo(IAsyncResult result)
  39. {
  40. object[] parms = result.AsyncState as object[];
  41. HttpWebRequest req = parms[] as HttpWebRequest;
  42. int ix =int.Parse( parms[].ToString());
  43. Dictionary<string, string> dicParms = parms[] as Dictionary<string, string>;
  44. Action<string> SuccessCallback = parms[] as Action<string>;
  45. Action<string> FailCallback = parms[] as Action<string>;
  46. try
  47. {
  48. using (HttpWebResponse rsp = req.EndGetResponse(result) as HttpWebResponse)
  49. {
  50. using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
  51. {
  52. string msg = "";
  53. msg = reader.ReadToEnd();
  54.  
  55. string code = "无";
  56.  
  57. //<input[^<>]*?name\s*?=\s*?["']TextBox_YG_Code_str["'][^<>]*?value\s*?=\s*?["'](?<code>[^"']*?)["']|<input[^<>]*?value\s*?=\s*?["'](?<code>[^"']*?)["'][^<>]*?name\s*?=\s*?["']TextBox_YG_Code_str["']
  58. //new Regex(@"<input[^<>]*?name\s*?=\s*?[""']TextBox_YG_Code_str[""'][^<>]*?value\s*?=\s*?[""'](?<code>[^""']*?)[""']|<input[^<>]*?value\s*?=\s*?[""'](?<code>[^""']*?)[""'][^<>]*?name\s*?=\s*?[""']TextBox_YG_Code_str[""']", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
  59. Match m = Regex.Match(msg, @"<input[^<>]*?name\s*?=\s*?[""']TextBox_YG_Code_str[""'][^<>]*?value\s*?=\s*?[""'](?<code>[^""']*?)[""']|<input[^<>]*?value\s*?=\s*?[""'](?<code>[^""']*?)[""'][^<>]*?name\s*?=\s*?[""']TextBox_YG_Code_str[""']", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
  60. if (m.Success)
  61. code = m.Groups["code"].Value.Trim();
  62. if (dicSznyEmployees[ix].ContainsKey("Code"))
  63. dicSznyEmployees[ix]["Code"] = code;
  64. else
  65. dicSznyEmployees[ix].Add("Code", code);
  66. queueSuccessEmployeeInfo.Enqueue(ix);
  67. }
  68. }
  69. }
  70. catch (Exception ex) {
  71. Business.queueSznyEmployeeInfo.Enqueue(ix);
  72. Business.queueMsg.Enqueue($"{DateTime.Now.ToString("yyy-MM-dd HH:mm:ss")}{ex.Message}");
  73. }
  74. }

入库

采集到的信息,通过定时任务保存到数据库。

  1.         Task.Factory.StartNew(() => {
  2. while (true) {
  3. if (Business.queueSuccessEmployeeInfo.Count <= ) {
  4. Thread.Sleep();
  5. continue;
  6. }
  7. List<Dictionary<string, string>> lst = new List<Dictionary<string, string>>();
  8. while (Business.queueSuccessEmployeeInfo.Count > ) {
  9. Business.queueSuccessEmployeeInfo.TryDequeue(out int ix);
  10. lst.Add(Business.dicSznyEmployees[ix]);
  11. if (lst.Count >= )
  12. break;
  13. }
  14. DbAccess.AddTran(lst, "SznyEmployee",new List<string>() { "UserName", "PersonID" });
  15.  
  16. Thread.Sleep();
  17. }
  18. });

总结

采集的时候,为了能利用已经采集到的信息,而不是重复采集,在采集的时候对数据库数据进行判断是否存在。纯粹是为了提高效率,WebForm的网站真是太慢,太慢了

以前写异步纯粹是为了提高线程效率,在.NET中感觉不到快乐。

终于搞定了,数据已经成功入库了。

.NET的没落也是有原因的,网站的速度的确是慢,.net押宝.net core的新体验了。

我讨厌采集WebForm网站,写了这么久的爬虫,祈祷永远不要再碰到WebForm了。

秀一下结果,庆祝一下吧。

爬虫系列 一次采集.NET WebForm网站的坎坷历程的更多相关文章

  1. 爬虫系列:连接网站与解析 HTML

    这篇文章是爬虫系列第三期,讲解使用 Python 连接到网站,并使用 BeautifulSoup 解析 HTML 页面. 在 Python 中我们使用 requests 库来访问目标网站,使用 Bea ...

  2. 爬虫系列3:Requests+Xpath 爬取租房网站信息并保存本地

    数据保存本地 [抓取]:参考前文 爬虫系列1:https://www.cnblogs.com/yizhiamumu/p/9451093.html [分页]:参考前文 爬虫系列2:https://www ...

  3. 爬虫系列2:Requests+Xpath 爬取租房网站信息

    Requests+Xpath 爬取租房网站信息 [抓取]:参考前文 爬虫系列1:https://www.cnblogs.com/yizhiamumu/p/9451093.html [分页]:参考前文 ...

  4. java爬虫系列第二讲-爬取最新动作电影《海王》迅雷下载地址

    1. 目标 使用webmagic爬取动作电影列表信息 爬取电影<海王>详细信息[电影名称.电影迅雷下载地址列表] 2. 爬取最新动作片列表 获取电影列表页面数据来源地址 访问http:// ...

  5. 爬虫系列:存储 CSV 文件

    上一期:爬虫系列:存储媒体文件,讲解了如果通过爬虫下载媒体文件,以及下载媒体文件相关代码讲解. 本期将讲解如果将数据保存到 CSV 文件. 逗号分隔值(Comma-Separated Values,C ...

  6. java爬虫系列第一讲-爬虫入门

    1. 概述 java爬虫系列包含哪些内容? java爬虫框架webmgic入门 使用webmgic爬取 http://ady01.com 中的电影资源(动作电影列表页.电影下载地址等信息) 使用web ...

  7. java爬虫系列目录

    1. java爬虫系列第一讲-爬虫入门(爬取动作片列表) 2. java爬虫系列第二讲-爬取最新动作电影<海王>迅雷下载地址 3. java爬虫系列第三讲-获取页面中绝对路径的各种方法 4 ...

  8. 爬虫系列1:Requests+Xpath 爬取豆瓣电影TOP

    爬虫1:Requests+Xpath 爬取豆瓣电影TOP [抓取]:参考前文 爬虫系列1:https://www.cnblogs.com/yizhiamumu/p/9451093.html [分页]: ...

  9. python 全栈开发,Day134(爬虫系列之第1章-requests模块)

    一.爬虫系列之第1章-requests模块 爬虫简介 概述 近年来,随着网络应用的逐渐扩展和深入,如何高效的获取网上数据成为了无数公司和个人的追求,在大数据时代,谁掌握了更多的数据,谁就可以获得更高的 ...

随机推荐

  1. AI vs PS 矢量 VS 位图

    矢量图 AI最大可以放大64000%.不会失真,依然很清晰.原理是不同的点以及点与点之间的路径构成的,不论放大的多大,点在路径在,就可以精确的计算出它的区域.AI中无法直接编辑位图. 位图 代表PS, ...

  2. Jmeter 使用正则表达式提取响应结果中的值

    正则表达式提取的界面如下图: apply to: Main sample and sub-samples:作用于父节点取样器及对应子节点取样器Main sample only:仅作用于父节点取样器Su ...

  3. Thymeleaf+SpringBoot+Mybatis实现的齐贤易游网旅游信息管理系统

    项目简介 项目来源于:https://github.com/liuyongfei-1998/root 本系统是基于Thymeleaf+SpringBoot+Mybatis.是非常标准的SSM三大框架( ...

  4. java 递归及其经典应用--求阶乘、打印文件信息、计算斐波那契数列

    什么是递归 我先看下百度百科的解释: 一种计算过程,如果其中每一步都要用到前一步或前几步的结果,称为递归的.用递归过程定义的函数,称为递归函数,例如连加.连乘及阶乘等.凡是递归的函数,都是可计算的,即 ...

  5. php中session_id()函数详细介绍,会话id生成过程及session id长度

    php中session_id()函数原型及说明session_id()函数说明:stringsession_id([string$id])session_id() 可以用来获取/设置 当前会话 ID. ...

  6. Linux 如何通过 iscsi target name 获取 ip

    by Mike Andrews # lsscsi -t [:::] disk iqn.-.com.blockbridge:t-pjxfzufjkp-illoghjk,t,0x1 /dev/sda [: ...

  7. Redis持久化存储(一)

    Redis介绍 Redis 是完全开源免费的,遵守BSD协议,是一个高性能的key-value数据库. Redis 与其他 key - value 缓存产品有以下三个特点: Redis支持数据的持久化 ...

  8. 设计模式 - 迭代器模式详解及其在ArrayList中的应用

    基本介绍 迭代器模式(Iterator Pattern)是 Java 中使用最多的一种模式,它可以顺序的访问容器中的元素,但不需要知道容器的内部细节 模式结构 Iterator(抽象迭代器):定义遍历 ...

  9. js 之 JSON详解

    JSON:JavaScriptObjectNotation JSON是一种语法,用来序列化对象.数组.字符串.布尔值和null. JSON是基于JavaScript的语法,但与之不同 注意事项 JSO ...

  10. c++ 如何开N次方?速解

    c++ 如何开N次方?速解   直接上代码 #include <iostream> #include <cmath> using namespace std; typedef ...