0. 写在前面









1. 爬虫思路


  1. 模拟登录HDU
  2. 针对某一道题目
    • 搜索AC代码

      • 通过正则表达式进行代码的提取
      • 通过htmlparser进行代码的处理
    • 提交
      • 若AC,返回步骤2,继续处理下一道题目
      • 否则,继续提交代码(这里最多只提交10份代码)
      • 10次提交后还未AC,放弃此题

2. 简单粗暴的代码

  # -*- coding: utf-8 -*-
  # Module-level setup for the HDU auto-submit script: imports, the URLs the
  # script talks to, the request headers, and the shared HTTP session.
  #
  # NOTE: the original first line was "#coding='utf-8'", which the interpreter
  # does not recognise as an encoding declaration (the quote breaks the
  # PEP 263 pattern); the canonical form is used instead.
  import HTMLParser
  import getpass
  import os
  import re
  import time

  import requests

  # Root page of the HDU online judge; fetched once before logging in.
  host_url = 'http://acm.hdu.edu.cn'
  # Endpoint the login form is posted to.
  post_url = 'http://acm.hdu.edu.cn/userloginex.php?action=login'
  # Endpoint a solution is posted to.
  sub_url = 'http://acm.hdu.edu.cn/submit.php?action=submit'
  # CSDN search endpoint used to look for blog posts about a problem.
  csdn_url = 'http://so.csdn.net/so/search/s.do'
  # Browser-like User-Agent sent when fetching blog posts and when submitting.
  head = {
      'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
  }
  # Used to turn HTML entities inside scraped code back into plain characters.
  html_parser = HTMLParser.HTMLParser()
  # One session shared by login and submit so the login cookies are reused.
  s = requests.session()
  def login(usr, psw):
      """Log the shared session ``s`` in to the judge.

      Args:
          usr: account name, sent as the ``username`` form field.
          psw: password, sent as the ``userpass`` form field.

      Returns:
          The response object of the login POST. The original code dropped
          it; returning it lets a caller inspect whether the login worked,
          and callers that ignore the return value are unaffected.
      """
      # The home page is requested before posting the form -- presumably so
      # the session already holds the site's cookies; confirm if this matters.
      s.get(host_url)
      data = {'username': usr, 'userpass': psw, 'login': 'Sign In'}
      return s.post(post_url, data=data)
  def check_lan(lan):
      """Map a code-block language tag to the ``language`` value to submit.

      Args:
          lan: the ``class`` attribute text captured from the blog's code
              block (for example ``'java'`` or ``'cpp'``).

      Returns:
          ``'5'`` when the tag mentions Java, otherwise ``'0'``. These are
          presumably the judge's form codes for Java and for the default
          C/C++ compiler -- verify against the submit form.

      The comparison ignores case so tags such as ``'Java'`` are recognised
      too. Any tag merely containing ``java`` (e.g. ``javascript``) is also
      treated as Java, as in the original.
      """
      if 'java' in lan.lower():
          return '5'
      return '0'
  def parser_code(code):
      """Convert HTML-escaped source text into UTF-8 encoded plain code.

      Args:
          code: text captured from a blog page, possibly containing HTML
              entities.

      Returns:
          The text with entities unescaped by ``html_parser``, encoded as
          UTF-8.
      """
      unescaped = html_parser.unescape(code)
      return unescaped.encode('utf-8')
  def is_ac(pid, usr):
      """Report whether problem ``pid`` appears in ``usr``'s solved list.

      Fetches the user's status page, extracts the script text that follows
      the "List of solved problems" heading and looks for ``pid`` in it.

      Args:
          pid: problem id as a string of digits.
          usr: account name whose status page is checked.

      Returns:
          True if the id is found (a message is printed as well), False
          otherwise. False is also returned when the solved-problems section
          cannot be located in the page, where the original raised
          AttributeError.
      """
      page = requests.get('http://acm.hdu.edu.cn/userstatus.php?user=' + usr).text
      accept = re.search(
          'List of solved problems</font></h3>.*?<p align=left><script language=javascript>(.*?)</script><br></p>',
          page,
          re.S,
      )
      if accept is None:
          # Page layout differs from what the pattern expects (or the request
          # returned something else); treat the problem as not yet solved.
          return False
      # Match the id only as a whole number, so that e.g. '1000' is not
      # "found" inside '11000' as a plain substring test would allow.
      whole_number = r'(?<!\d)' + re.escape(pid) + r'(?!\d)'
      if re.search(whole_number, accept.group(1)):
          print('%s was solved' % pid)
          return True
      return False
  def search_csdn(PID, usr, max_submit=10):
      """Search CSDN blogs for a solution to problem ``PID`` and submit it.

      Each search hit is fetched; a hit is used only if its page title
      contains both the problem id and "hdu", a code block is found, and the
      code looks like C/C++ (contains ``include``) or Java (contains
      ``import java``). Every accepted candidate is submitted, followed by a
      5 second pause and a check of the user's solved list.

      Args:
          PID: problem id as a string of digits.
          usr: account name, used to check whether the problem got solved.
          max_submit: upper bound on submissions for this problem. Defaults
              to 10, the limit described in the accompanying write-up; the
              original code had no cap.

      Returns:
          True as soon as the problem shows up as solved, False if the hits
          or the submission budget run out first. (The original returned
          None in both cases.)
      """
      get_data = {'q': 'HDU ' + PID, 't': 'blog', 'o': '', 's': '', 'l': 'null'}
      search_html = requests.get(csdn_url, params=get_data).text
      linklist = re.findall(
          '<dd class="search-link"><a href="(.*?)" target="_blank">',
          search_html,
          re.S,
      )
      submitted = 0
      for link in linklist:
          if submitted >= max_submit:
              break
          print(link)
          tm_html = requests.get(link, headers=head).text

          title_match = re.search('<title>(.*?)</title>', tm_html, re.S)
          if title_match is None:
              # A page without a <title> cannot be matched to the problem;
              # skip it instead of crashing on .group().
              continue
          title = title_match.group(1).lower()
          if PID not in title or 'hdu' not in title:
              continue

          tmp = re.search('name="code" class="(.*?)">(.*?)</pre>', tm_html, re.S)
          if tmp is None:
              print('code not find')
              continue
          LAN = check_lan(tmp.group(1))
          CODE = parser_code(tmp.group(2))
          # Crude sanity check that the block really is C/C++ or Java source.
          if 'include' not in CODE and 'import java' not in CODE:
              continue

          print('%s %s' % (PID, LAN))
          print('--------------')
          submit_data = {'check': '0', 'problemid': PID, 'language': LAN, 'usercode': CODE}
          s.post(sub_url, headers=head, data=submit_data)
          submitted += 1
          # Wait before reading the status page -- presumably to give the
          # judge time to evaluate the submission.
          time.sleep(5)
          if is_ac(PID, usr):
              return True
      return False
  if __name__ == '__main__':
      # Ask for credentials, log in, then go through every problem id from
      # 1000 to 5679 inclusive and try to solve the ones not yet solved.
      usr = raw_input('input your username:')
      psw = getpass.getpass('input your password:')
      login(usr, psw)
      for number in range(1000, 5680):
          PID = str(number)
          if not is_ac(PID, usr):
              search_csdn(PID, usr)





  1. #include <cstdio>
  2. #include <cstring>
  3. #include <algorithm>
  4. #include <queue>
  5. using namespace std;
  #define clr( a, b ) memset( a, b, sizeof(a) )
  const int SIGMA_SIZE = 26;          // alphabet: 'a'..'z'
  const int NODE_SIZE = 500000 + 10;  // maximum number of trie nodes

  // Aho-Corasick automaton over the lowercase alphabet.
  //
  // Usage order: init(), insert() every pattern, getfail(), then work().
  // getfail() overwrites missing child links with fallback targets, so no
  // pattern may be inserted after it without calling init() again.
  //
  // Preconditions not checked here: the total number of trie nodes must stay
  // below NODE_SIZE.
  struct ac_automaton{
      int ch[ NODE_SIZE ][ SIGMA_SIZE ];  // child / transition table
      int f[ NODE_SIZE ];                 // failure link of each node
      int val[ NODE_SIZE ];               // number of patterns ending at the node
      int last[ NODE_SIZE ];              // nearest node on the failure chain with val > 0 (0 if none)
      int sz;                             // number of nodes in use; node 0 is the root

      // Reset to an empty automaton containing only the root.
      void init(){
          sz = 1;
          clr( ch[0], 0 );
          clr( val, 0 );
      }

      // Add pattern s. Inserting the same pattern twice counts it twice.
      // A pattern containing a character outside 'a'..'z' is ignored: it
      // would index outside ch[][] and cannot occur in text this automaton
      // accepts anyway.
      void insert( char *s ){
          for( int i = 0; s[i]; ++i ){
              int c = s[i] - 'a';
              if( c < 0 || c >= SIGMA_SIZE ) return;
          }
          int u = 0;
          for( int i = 0; s[i]; ++i ){
              int c = s[i] - 'a';
              if( !ch[u][c] ){
                  // Fresh node: clear its row and counter before linking it in.
                  clr( ch[sz], 0 );
                  val[sz] = 0;
                  ch[u][c] = sz++;
              }
              u = ch[u][c];
          }
          val[u]++;
      }

      // Build failure links and output links breadth-first, and fill every
      // missing transition so work() can follow ch[][] directly.
      void getfail(){
          std::queue<int> q;
          f[0] = 0;
          last[0] = 0;  // read below through last[ f[u] ]; the original never set it
          for( int c = 0; c < SIGMA_SIZE; ++c ){
              int u = ch[0][c];
              if( u ){
                  f[u] = 0;
                  last[u] = 0;
                  q.push( u );
              }
          }
          while( !q.empty() ){
              int r = q.front(); q.pop();
              for( int c = 0; c < SIGMA_SIZE; ++c ){
                  int u = ch[r][c];
                  if( !u ){
                      // No real child: reuse the transition of the failure node.
                      ch[r][c] = ch[ f[r] ][c];
                      continue;
                  }
                  q.push( u );
                  int v = f[r];
                  while( v && !ch[v][c] ) v = f[v];
                  f[u] = ch[v][c];
                  last[u] = val[ f[u] ] ? f[u] : last[ f[u] ];
              }
          }
      }

      // Count how many inserted patterns occur in text s (each inserted
      // pattern is counted once, however often it occurs).
      //
      // Destructive: counters of matched patterns are zeroed, so a second
      // call will not count them again.
      //
      // Characters outside 'a'..'z' cannot be part of a match; they reset
      // the automaton to the root instead of indexing out of bounds.
      int work( char* s ){
          int res = 0;
          int u = 0;
          for( int i = 0; s[i]; ++i ){
              int c = s[i] - 'a';
              if( c < 0 || c >= SIGMA_SIZE ){
                  u = 0;
                  continue;
              }
              u = ch[u][c];
              // Start at u if a pattern ends here, otherwise at the nearest
              // suffix node where one does. The original started at u
              // unconditionally and therefore missed patterns that are a
              // proper suffix of the current path when u itself ends none.
              int e = val[u] ? u : last[u];
              // A zeroed node means the rest of its chain was consumed
              // earlier, so stopping at the first zero is safe.
              while( val[e] ){
                  res += val[e];
                  val[e] = 0;
                  e = last[e];
              }
          }
          return res;
      }
  }ac;

