

  1. 前端页面是用JS模板引擎生成的
  2. 接口主要是用POST提交参数的




  1. http://www.ibm.com/developerworks/aix/library/au-threadingpython/
  2. http://stackoverflow.com/questions/10525185/python-threading-how-do-i-lock-a-thread





1 URL队列和结果队列



  1. from Queue import Queue
  2. urls_queue = Queue()
  3. out_queue = Queue()

2 请求线程


  1. import threading
  2. class ThreadCrawl(threading.Thread):
  3. def __init__(self, queue, out_queue):
  4. threading.Thread.__init__(self)
  5. self.queue = queue
  6. self.out_queue = out_queue
  7. def run(self):
  8. while True:
  9. item = self.queue.get()
  10. self.queue.task_down()


Queue.get([block[, timeout]])

Remove and return an item from the queue. If optional args block is true and timeout is None (the default), block if necessary until an item is available.


Queue.task_done()

Indicate that a formerly enqueued task is complete. Used by queue consumer threads. For each get() used to fetch a task, a subsequent call to task_done() tells the queue that the processing on the task is complete.




  1. lock = threading.Lock()
  2. f = codecs.open('out.txt', 'w', 'utf8')


  1. with lock:
  2. f.write(something)






  1. # coding: utf-8
  2. '''
  3. Author mr_zys
  4. Email myzysv5@sina.com
  5. '''
  6. from Queue import Queue
  7. import threading
  8. import urllib2
  9. import time
  10. import json
  11. import codecs
  12. from bs4 import BeautifulSoup
  13. urls_queue = Queue()
  14. data_queue = Queue()
  15. lock = threading.Lock()
  16. f = codecs.open('out.txt', 'w', 'utf8')
  17. class ThreadUrl(threading.Thread):
  18. def __init__(self, queue):
  19. threading.Thread.__init__(self)
  20. self.queue = queue
  21. def run(self):
  22. pass
  23. class ThreadCrawl(threading.Thread):
  24. def __init__(self, url, queue, out_queue):
  25. threading.Thread.__init__(self)
  26. self.url = url
  27. self.queue = queue
  28. self.out_queue = out_queue
  29. def run(self):
  30. while True:
  31. item = self.queue.get()
  32. data = self._data_post(item)
  33. try:
  34. req = urllib2.Request(url=self.url, data=data)
  35. res = urllib2.urlopen(req)
  36. except urllib2.HTTPError, e:
  37. raise e.reason
  38. py_data = json.loads(res.read())
  39. res.close()
  40. item['first'] = 'false'
  41. item['pn'] = item['pn'] + 1
  42. success = py_data['success']
  43. if success:
  44. print 'Get success...'
  45. else:
  46. print 'Get fail....'
  47. print 'pn is : %s' % item['pn']
  48. result = py_data['content']['result']
  49. if len(result) != 0:
  50. self.queue.put(item)
  51. print 'now queue size is: %d' % self.queue.qsize()
  52. self.out_queue.put(py_data['content']['result'])
  53. self.queue.task_done()
  54. def _data_post(self, item):
  55. pn = item['pn']
  56. first = 'false'
  57. if pn == 1:
  58. first = 'true'
  59. return 'first=' + first + '&pn=' + str(pn) + '&kd=' + item['kd']
  60. def _item_queue(self):
  61. pass
  62. class ThreadWrite(threading.Thread):
  63. def __init__(self, queue, lock, f):
  64. threading.Thread.__init__(self)
  65. self.queue = queue
  66. self.lock = lock
  67. self.f = f
  68. def run(self):
  69. while True:
  70. item = self.queue.get()
  71. self._parse_data(item)
  72. self.queue.task_done()
  73. def _parse_data(self, item):
  74. for i in item:
  75. l = self._item_to_str(i)
  76. with self.lock:
  77. print 'write %s' % l
  78. self.f.write(l)
  79. def _item_to_str(self, item):
  80. positionName = item['positionName']
  81. positionType = item['positionType']
  82. workYear = item['workYear']
  83. education = item['education']
  84. jobNature = item['jobNature']
  85. companyName = item['companyName']
  86. companyLogo = item['companyLogo']
  87. industryField = item['industryField']
  88. financeStage = item['financeStage']
  89. companyShortName = item['companyShortName']
  90. city = item['city']
  91. salary = item['salary']
  92. positionFirstType = item['positionFirstType']
  93. createTime = item['createTime']
  94. positionId = item['positionId']
  95. return positionName + ' ' + positionType + ' ' + workYear + ' ' + education + ' ' + \
  96. jobNature + ' ' + companyLogo + ' ' + industryField + ' ' + financeStage + ' ' + \
  97. companyShortName + ' ' + city + ' ' + salary + ' ' + positionFirstType + ' ' + \
  98. createTime + ' ' + str(positionId) + '\n'
  99. def main():
  100. for i in range(4):
  101. t = ThreadCrawl(
  102. 'http://www.lagou.com/jobs/positionAjax.json', urls_queue, data_queue)
  103. t.setDaemon(True)
  104. t.start()
  105. datas = [
  106. {'first': 'true', 'pn': 1, 'kd': 'Java'}
  107. #{'first': 'true', 'pn': 1, 'kd': 'Python'}
  108. ]
  109. for d in datas:
  110. urls_queue.put(d)
  111. for i in range(4):
  112. t = ThreadWrite(data_queue, lock, f)
  113. t.setDaemon(True)
  114. t.start()
  115. urls_queue.join()
  116. data_queue.join()
  117. with lock:
  118. f.close()
  119. print 'data_queue siez: %d' % data_queue.qsize()
  120. main()




