# version 1.0
def connect_mysql(sql, oper_type="select", data_l=None):
    """Execute *sql* against the local MySQL ``work`` database.

    Parameters
    ----------
    sql : str
        Statement to execute.
    oper_type : str
        ``"insert"`` runs ``executemany(sql, data_l)`` and commits;
        any other value runs a plain query and fetches all rows.
    data_l : sequence | None
        Parameter rows for the ``"insert"`` path.

    Returns
    -------
    tuple | None
        ``cursor.fetchall()`` for queries; ``None`` for inserts.
        (Bug fix: the original never assigned ``result`` on the insert
        path, so ``return result`` raised UnboundLocalError.)
    """
    conn = pymysql.connect(host='localhost', user="root", password="",
                           database="work", port=3306)
    result = None
    try:
        cur = conn.cursor()
        if oper_type == "insert":
            cur.executemany(sql, data_l)
            conn.commit()
        else:
            cur.execute(sql)
            result = cur.fetchall()
    finally:
        # Bug fix: the original leaked the connection whenever the
        # statement raised.
        conn.close()
    return result
  15. def process_jobs(field_name):
  16. sql = "select j." + field_name + " FROM personal_jobs j"
  17. column_name = connect_mysql(sql, oper_type="select")
  18. row_total = (len(column_name))
  19. row_category = set(column_name)
  20.  
  21. # init category dict
  22. category_dict = {}
  23. for k in row_category:
  24. category_dict[k] = 0
  25.  
  26. # calculate amount
  27. cal_nmu = 0
  28. for k in row_category:
  29. for r in column_name:
  30. if r == k:
  31. cal_nmu += 1
  32. category_dict[k] = cal_nmu
  33. cal_nmu = 0
  34. print(type(category_dict.items()), category_dict.items())
  35. print(row_total, len(category_dict.items()))
  36. return row_total, category_dict
  37.  
  38. process_jobs("job_salary")
  1. version 1.1
    def count_times(all_list):
  2. ls = []
  3. item_list = list(set(all_list))
  4. for m in item_list:
  5. c = all_list.count(m)
  6. ls.append([m, c])
  7. return sorted(ls)
  8.  
  9. def process_salary(field_name):
  10. # sql = "select " + field_name + " from work.personal_jobs where job_exp = '1-3年';"
  11. sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
  12. original_sal = connect_mysql(sql)
  13. # sort salary order
  14. row_category = list(set(original_sal))
  15. general_min, general_avg, general_max = [], [], []
  16. # cal_num = 0
  17. for sal in row_category:
  18. # calculate category amount
  19. # for cat in column_name:
  20. # if cat == sal:
  21. # cal_num += 1
  22. # process salary
  23. if field_name == "job_salary":
  24. sal_tmp = str(sal).strip("('").strip("K',)").split("K-")
  25. general_min.append(int(sal_tmp[0]))
  26. general_max.append(int(sal_tmp[1]))
  27.  
  28. # process experience
  29. if field_name == "job_exp":
  30. print(original_sal)
  31.  
  32. # initial again
  33. # cal_num = 0
  34.  
  35. # calculate min sal
  36. min_sal = count_times(general_min)
  37. for m1 in min_sal:
  38. min_s = str(m1[0]) + "K"
  39. m1[0] = min_s
  40.  
  41. # calculate max sal
  42. max_sal = count_times(general_max)
  43. for m2 in max_sal:
  44. min_s = str(m2[0]) + "K"
  45. m2[0] = min_s
  46.  
  47. # calculate avg sal
  48. avg_sal = count_times(original_sal)
  49. print("original: ", avg_sal)
  50. for a1 in avg_sal:
  51. sal_tmp_1 = str(a1[0]).strip("('").strip("K',)").split("K-")
  52. a1[0] = (int(sal_tmp_1[0]) + int(sal_tmp_1[1])) / 2.0
  53. avg_sal = sorted(avg_sal)
  54.  
  55. for a2 in avg_sal:
  56. a2[0] = str(a2[0]) + "K"
  57. # debug
  58. print(len(min_sal), min_sal)
  59. print(len(avg_sal), avg_sal)
  60. print(len(max_sal), max_sal)
  61. return min_sal, avg_sal, max_sal
  62.  
  63. # process_salary("job_salary")
  1. import jieba
  2. from wordcloud import WordCloud
  3. import matplotlib.pyplot as plt
  4. from collections import Counter
  5. from scipy.misc import imread
  6.  
  7. def process_reqirement(field_name):
  8. sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
  9. original_req = connect_mysql(sql)
  10. userdict = ["C", "C#", "C++", "Go", "Linux", "MongoDB", "Mysql", "PostgreSQL", "Ajax", "Bootstrap", "CSS", "Django", "Docker", "Flask", "Git", "http", "tcp", "Java", "JavaScript", "Jquery", "Oracle", "Python", "Redis", "Ruby", "Scrapy", "shell", "Tornado", "Web", "Zabbix", "RESTful", "云计算", "分布式", "前端", "后端", "大数据", "高并发", "数据分析", "数据挖掘", "机器学习", "爬虫", "算法", "自动化", "运维", "集群"]
  11.  
  12. jieba.load_userdict(userdict)
  13. # print(type(original_req), str(original_req))
  14. text0 = Counter(jieba.cut(str(original_req)))
  15. text1 = " ".join(jieba.cut(str(original_req)))
  16. [item for item in sorted(text0.values())]
  17. # print(text0.keys(), text0.values())
  18. # print(type(text0), text0)
  19.  
  20. # # create word cloud
  21. # wordcloud = WordCloud(font_path=r"D:\wwj\work\script\web\personal\database_operation\MSYH.TTC",
  22. # background_color="white", mask=imread("china.jpg")).generate(text1)
  23. # plt.imshow(wordcloud)
  24. # plt.axis("off")
  25. # plt.show()
  26.  
  27. # find requirement item what we really need
  28. req_list = []
  29. # print(len(text0.keys()), text0)
  30. for k, v in text0.items():
  31. for kk, vv in text0.items():
  32. if str(k).lower() == str(kk).lower():
  33. # print(k, v)
  34. req_list.append([k, (v + vv)])
  35. # print(k, v)
  36. break
  37. print(len(req_list), req_list)
  38.  
  39. for t in userdict:
  40. for k, v in text0.items():
  41. if t.lower() == str(k).lower():
  42. req_list.append([t, v])
  43. break
  44. # print(req_list)
  45. return req_list
  46. process_reqirement("job_requirement")
  1. def user_defined(file_name):
  2. user_list = []
  3. with open(file_name, "r", encoding="utf8") as f:
  4. for i in f:
  5. user_list.append(i.strip())
  6. return user_list
  7.  
  8. def process_company(field_name):
  9. sql = "select " + field_name + " from work.personal_jobs"
  10. company = [list(i) for i in connect_mysql(sql)]
  11. user_list = user_defined("t.txt")
  12. user_list = ['C','C#','C++','Go','Linux','MongoDB','Mysql','PostgreSQL','Ajax','Bootstrap','CSS','Django','Docker','Flask','Git','http','tcp','Java','JavaScript','Jquery','Oracle','Python','Redis','Ruby','Scrapy','shell','Tornado','Web','RESTful','云计算','分布式','前端','后端','大数据','高并发','数据分析','数据挖掘','机器学习','爬虫','算法','自动化','测试','运维','集群']
  13. jieba.load_userdict(user_list)
  14. me_list = ['python', 'django', 'linux', '运维', '自动化', '爬虫', '数据分析', 'shell', 'mysql', 'oracle']
  15. req_list, suit_list = [], []
  16. for req in company:
  17. req_dict = Counter(jieba.cut(req[1]))
  18. req_list.append([req[0], [k for k in req_dict.keys() if k in user_list]])
  19. for r in req_list:
  20. if len(r[1]) > 0:
  21. # print(r[1])
  22. own = [item for item in me_list if item in r[1]]
  23. if len(own) > 0:
  24. suit_list.append([r[0], int(len(own) * 100/len(r[1]))])
  25. return sorted(suit_list, key=lambda x: x[1])
  26. # print(sorted(suit_list, key=lambda x: x[1]))
  27. process_company("company_name, job_requirement")

process data的更多相关文章

  1. 1.3 Quick Start中 Step 8: Use Kafka Streams to process data官网剖析(博主推荐)

    不多说,直接上干货! 一切来源于官网 http://kafka.apache.org/documentation/ Step 8: Use Kafka Streams to process data ...

  2. [CDH] Process data: integrate Spark with Spring Boot

    c 一.Spark 统计计算 简单统计后写入Redis. /** * 订单统计和乘车人数统计 */ object OrderStreamingProcessor { def main(args: Ar ...

  3. Flink应用案例:How Trackunit leverages Flink to process real-time data from industrial IoT devices

    January 22, 2019Use Cases, Apache Flink Lasse Nedergaard     Recently there has been significant dis ...

  4. [AJAX系列]$.post(url,[data],[fn],[type])

    概述: 通过远程HTTP POST请求载入信息 参数: url:发送请求地址 data:待发送Key/value值 callback:发送成功时回调函数 type:返回内容格式  xml  html ...

  5. Data Science at the Command Line学习笔记(二)

    1.vagrant建立简单httpserver方法: 1)映射端口 修改Vagrantfile, 末尾添加本地端口和虚机端口的映射关系, 然后执行vagrant reload. Vagrant::Co ...

  6. [Chapter 3 Process]Practice 3.3 Discuss three major complications that concurrent processing adds to an operating system.

    3.3  Original version of Apple's mobile iOS operating system provied no means of concurrent processi ...

  7. Learn know more about big data

    As we all know,we are in a big data age now."Every sword has two slides",as a ITer,we shou ...

  8. Monitoring and Tuning the Linux Networking Stack: Receiving Data

    http://blog.packagecloud.io/eng/2016/06/22/monitoring-tuning-linux-networking-stack-receiving-data/ ...

  9. Big Data Analytics for Security(Big Data Analytics for Security Intelligence)

    http://www.infoq.com/articles/bigdata-analytics-for-security This article first appeared in the IEEE ...

随机推荐

  1. django上课笔记6-MVC,MTV架构-中间件-初识Form组件

    一.MVC,MTV架构 models(数据库,模型) views(html模板) controllers(业务逻辑处理) --> MVC models(数据库,模型) templates(htm ...

  2. hadoop中的序列化

    此文已由作者肖凡授权网易云社区发布. 欢迎访问网易云社区,了解更多网易技术产品运营经验. 最近在学习hadoop,发现hadoop的序列化过程和jdk的序列化有很大的区别,下面就来说说这两者的区别都有 ...

  3. poj1312dfs基础

    就是很简单的DFS-因为数据偏小,上去就是干了 #include <stdio.h> #include <string.h> #include <math.h> # ...

  4. poj 1743 Musical Theme【二分+SA】

    差分,然后二分长度mid,判断是把height按照min不小于mid分组,取最大最小的sa位置看是否>=mid即可,注意差分后最后答案要+1 #include<iostream> # ...

  5. 第二篇 .NET高级技术之密闭类和静态类及扩展方法

    1.密闭类是修饰为sealed的类, sealed不能有子类.一般只有系统中的一些基本类声明为sealed.面试题:是否可以编写一个类继承自String类? 答:不能,因为string被声明为了sea ...

  6. tyvj 1391 走廊泼水节【最小生成树】By cellur925

    题目传送门 题意简化:给你一棵树,要求你加边使它成为完全图(任意两点间均有一边相连) ,满足原来的树是这个图的最小生成树.求加边的价值最小是多少. 考虑Kruskal的过程,我们每次找一条最短的,两边 ...

  7. Codeforces Round #513解题报告(A~E)By cellur925

    我是比赛地址 A:Phone Numbers $Description$:给你一串数字,问你能组成多少开头为8的11位电话号码. $Sol$:统计8的数量,与$n$%11作比较. #include&l ...

  8. Springboot配置类

    配置类 MyAppConfig  import com.test.springboot.service.HelloService; import org.springframework.context ...

  9. 递归(Recursion)

    递归是一种非常常用的算法,分为“递”和“归”两个步骤.满足递归算法有三个条件:1.一个问题,可以分解为子问题:2.该问题,与分解后的子问题,解决思路一致:3.存在终止条件.案例演示:假设有n个台阶,每 ...

  10. POJ 1151 Atlantis(扫描线)

    题目原链接:http://poj.org/problem?id=1151 题目中文翻译: POJ 1151 Atlantis Time Limit: 1000MS   Memory Limit: 10 ...