# version 1.0
def connect_mysql(sql, oper_type="select", data_l=None):
    """Run one SQL statement against the local ``work`` MySQL database.

    Args:
        sql: Statement text to run.
        oper_type: ``"insert"`` runs ``executemany(sql, data_l)`` and commits;
            any other value runs ``execute(sql)`` and fetches every row.
        data_l: Parameter rows for the insert path; ignored otherwise.

    Returns:
        On the insert path, whatever ``executemany`` returns (PyMySQL reports
        the affected row count). Otherwise the rows from ``fetchall()``.
    """
    # NOTE(review): credentials are hard-coded (root, empty password).
    conn = pymysql.connect(host='localhost', user="root", password="",
                           database="work", port=3306)
    try:
        with conn.cursor() as cur:
            if oper_type == "insert":
                # Previously nothing was bound to ``result`` on this path, so
                # the final ``return result`` raised UnboundLocalError.
                result = cur.executemany(sql, data_l)
                conn.commit()
            else:
                cur.execute(sql)
                result = cur.fetchall()
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()
    return result


def process_jobs(field_name):
    """Count how many rows of ``personal_jobs`` fall into each distinct value.

    Args:
        field_name: Column expression spliced into the SELECT after ``j.``.
            It is concatenated into the SQL text, so pass trusted column
            names only.

    Returns:
        ``(row_total, category_dict)`` where ``row_total`` is the number of
        fetched rows and ``category_dict`` maps each distinct fetched row to
        its number of occurrences.
    """
    # Imported here because this module calls process_jobs() at import time,
    # before the file-level ``from collections import Counter`` further down.
    from collections import Counter

    sql = "select j." + field_name + " FROM personal_jobs j"
    column_name = connect_mysql(sql, oper_type="select")
    row_total = len(column_name)

    # One pass instead of comparing every category against every row.
    category_dict = dict(Counter(column_name))

    print(type(category_dict.items()), category_dict.items())
    print(row_total, len(category_dict.items()))
    return row_total, category_dict


process_jobs("job_salary")
# version 1.1
def count_times(all_list):
    """Count the occurrences of every distinct item.

    Args:
        all_list: Iterable of hashable, mutually comparable items.

    Returns:
        A list of ``[item, count]`` pairs sorted ascending (by item first).
    """
    # Local import keeps the function usable regardless of where the
    # file-level imports sit relative to this definition.
    from collections import Counter

    # Single counting pass instead of one full ``count()`` scan per item.
    return sorted([item, times] for item, times in Counter(all_list).items())


def _salary_bounds(row):
    """Split a fetched salary row such as ``('10K-20K',)`` into ``['10', '20']``."""
    return str(row).strip("('").strip("K',)").split("K-")


def process_salary(field_name):
    """Summarise the salary ranges of entry-level jobs.

    Fetches ``field_name`` for rows whose ``job_exp`` is '1年以内' or
    '经验不限' and builds three frequency tables.

    Args:
        field_name: Column expression spliced into the SELECT. The min/max
            tables are only filled when it equals ``"job_salary"``; the
            average table parses every fetched row as a salary range
            regardless. It is concatenated into the SQL text, so pass trusted
            column names only.

    Returns:
        ``(min_sal, avg_sal, max_sal)``, each a sorted list of
        ``[label, count]`` pairs with labels like ``"10K"`` (``"15.0K"`` for
        averages). ``min_sal``/``max_sal`` count over the *distinct* salary
        ranges; ``avg_sal`` counts over all fetched rows.
    """
    # Alternative filter used previously: job_exp = '1-3年'
    sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
    original_sal = connect_mysql(sql)

    row_category = list(set(original_sal))
    general_min, general_max = [], []
    for sal in row_category:
        if field_name == "job_salary":
            bounds = _salary_bounds(sal)
            general_min.append(int(bounds[0]))
            general_max.append(int(bounds[1]))
        # NOTE(review): the source had lost its indentation; this debug print
        # is assumed to sit inside the loop like the salary branch above.
        if field_name == "job_exp":
            print(original_sal)

    # Sort numerically first, then attach the "K" suffix.
    min_sal = [[str(low) + "K", times] for low, times in count_times(general_min)]
    max_sal = [[str(high) + "K", times] for high, times in count_times(general_max)]

    counted = count_times(original_sal)
    print("original: ", counted)
    averaged = []
    for row, times in counted:
        bounds = _salary_bounds(row)
        averaged.append([(int(bounds[0]) + int(bounds[1])) / 2.0, times])
    # Ranges that share a midpoint stay as separate entries.
    avg_sal = [[str(avg) + "K", times] for avg, times in sorted(averaged)]

    # debug
    print(len(min_sal), min_sal)
    print(len(avg_sal), avg_sal)
    print(len(max_sal), max_sal)
    return min_sal, avg_sal, max_sal

# process_salary("job_salary")
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread no longer exists in current SciPy releases. Only the
    # disabled word-cloud code needs it, so do not let it break the script.
    imread = None


def process_reqirement(field_name):
    """Tokenise entry-level job requirements and count keyword frequencies.

    Fetches ``field_name`` for rows whose ``job_exp`` is '1年以内' or
    '经验不限', segments the text with jieba and counts the tokens.

    Args:
        field_name: Column expression spliced into the SELECT. It is
            concatenated into the SQL text, so pass trusted column names only.

    Returns:
        A list of ``[word, count]`` pairs: first one entry per token, then one
        entry per ``userdict`` keyword that occurs in the text
        (case-insensitively).
    """
    sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
    original_req = connect_mysql(sql)
    userdict = ["C", "C#", "C++", "Go", "Linux", "MongoDB", "Mysql", "PostgreSQL", "Ajax", "Bootstrap", "CSS", "Django", "Docker", "Flask", "Git", "http", "tcp", "Java", "JavaScript", "Jquery", "Oracle", "Python", "Redis", "Ruby", "Scrapy", "shell", "Tornado", "Web", "Zabbix", "RESTful", "云计算", "分布式", "前端", "后端", "大数据", "高并发", "数据分析", "数据挖掘", "机器学习", "爬虫", "算法", "自动化", "运维", "集群"]
    # NOTE(review): a list is passed where jieba documents a dictionary file;
    # this relies on load_userdict iterating its argument -- confirm against
    # the installed jieba version.
    jieba.load_userdict(userdict)

    text0 = Counter(jieba.cut(str(original_req)))

    # Word-cloud rendering is disabled. To re-enable, feed
    # " ".join(jieba.cut(str(original_req))) to
    # WordCloud(font_path=..., background_color="white",
    #           mask=imread("china.jpg")).generate(...) and show it with
    # plt.imshow(...); plt.axis("off"); plt.show().

    # Count of the first token (in Counter order) for each lowercase spelling.
    # Replaces two nested scans over all tokens with one lookup table.
    first_count = {}
    for word, times in text0.items():
        first_count.setdefault(str(word).lower(), times)

    # NOTE(review): a token with no earlier case variant matches itself, so
    # its count is doubled here. Kept as in the original; confirm the intent.
    req_list = [[word, times + first_count[str(word).lower()]]
                for word, times in text0.items()]
    print(len(req_list), req_list)

    for t in userdict:
        key = t.lower()
        if key in first_count:
            req_list.append([t, first_count[key]])
    return req_list


process_reqirement("job_requirement")
def user_defined(file_name):
    """Read a UTF-8 text file and return its lines, whitespace-stripped.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(file_name, "r", encoding="utf8") as f:
        return [line.strip() for line in f]


def process_company(field_name):
    """Score how well each job's requirements match a personal skill list.

    For every fetched row, the second column is segmented with jieba and
    reduced to the known keywords it mentions. The score is the percentage of
    those keywords that also appear in ``me_list``.

    Args:
        field_name: Column list spliced into the SELECT; the first column is
            used as the label and the second as the requirement text. It is
            concatenated into the SQL text, so pass trusted column names only.

    Returns:
        A list of ``[label, score]`` pairs sorted by ascending integer score.
        Rows with no known keyword, or no keyword in ``me_list``, are omitted.
    """
    sql = "select " + field_name + " from work.personal_jobs"
    company = connect_mysql(sql)

    # The keyword list is inlined. The former user_defined("t.txt") call was
    # dropped: its result was overwritten at once and it raised whenever the
    # file was missing. Use user_defined() here to load keywords from a file.
    user_list = ['C', 'C#', 'C++', 'Go', 'Linux', 'MongoDB', 'Mysql', 'PostgreSQL',
                 'Ajax', 'Bootstrap', 'CSS', 'Django', 'Docker', 'Flask', 'Git',
                 'http', 'tcp', 'Java', 'JavaScript', 'Jquery', 'Oracle', 'Python',
                 'Redis', 'Ruby', 'Scrapy', 'shell', 'Tornado', 'Web', 'RESTful',
                 '云计算', '分布式', '前端', '后端', '大数据', '高并发', '数据分析',
                 '数据挖掘', '机器学习', '爬虫', '算法', '自动化', '测试', '运维', '集群']
    # NOTE(review): a list is passed where jieba documents a dictionary file;
    # confirm against the installed jieba version.
    jieba.load_userdict(user_list)
    me_list = ['python', 'django', 'linux', '运维', '自动化', '爬虫', '数据分析', 'shell', 'mysql', 'oracle']

    # Compare in lowercase: me_list is lowercase while user_list is not, so
    # an exact match could never pair 'python' with 'Python'.
    known = {word.lower() for word in user_list}
    mine = {word.lower() for word in me_list}

    suit_list = []
    for row in company:
        # A NULL requirement text is treated as empty.
        found = {token.lower() for token in jieba.cut(row[1] or "")} & known
        own = found & mine
        if own:
            suit_list.append([row[0], int(len(own) * 100 / len(found))])
    return sorted(suit_list, key=lambda x: x[1])


process_company("company_name, job_requirement")

process data的更多相关文章

  1. 1.3 Quick Start中 Step 8: Use Kafka Streams to process data官网剖析(博主推荐)

    不多说,直接上干货! 一切来源于官网 http://kafka.apache.org/documentation/ Step 8: Use Kafka Streams to process data ...

  2. [CDH] Process data: integrate Spark with Spring Boot

    c 一.Spark 统计计算 简单统计后写入Redis. /** * 订单统计和乘车人数统计 */ object OrderStreamingProcessor { def main(args: Ar ...

  3. Flink应用案例:How Trackunit leverages Flink to process real-time data from industrial IoT devices

    January 22, 2019Use Cases, Apache Flink Lasse Nedergaard     Recently there has been significant dis ...

  4. [AJAX系列]$.post(url,[data],[fn],[type])

    概述: 通过远程HTTP POST请求载入信息 参数: url:发送请求地址 data:待发送Key/value值 callback:发送成功时回调函数 type:返回内容格式  xml  html ...

  5. Data Science at the Command Line学习笔记(二)

    1.vagrant建立简单httpserver方法: 1)映射端口 修改Vagrantfile, 末尾添加本地端口和虚机端口的映射关系, 然后执行vagrant reload. Vagrant::Co ...

  6. [Chapter 3 Process]Practice 3.3 Discuss three major complications that concurrent processing adds to an operating system.

    3.3  The original version of Apple's mobile iOS operating system provided no means of concurrent processi ...

  7. Learn know more about big data

    As we all know, we are in a big data age now. "Every sword has two sides"; as an ITer, we shou ...

  8. Monitoring and Tuning the Linux Networking Stack: Receiving Data

    http://blog.packagecloud.io/eng/2016/06/22/monitoring-tuning-linux-networking-stack-receiving-data/ ...

  9. Big Data Analytics for Security(Big Data Analytics for Security Intelligence)

    http://www.infoq.com/articles/bigdata-analytics-for-security This article first appeared in the IEEE ...

随机推荐

  1. 看鸟哥的Linux私房菜的一些命令自我总结(二)

    -关于执行文件路径的变量  $PATH -查看文件与目录  ls -a  :全部的文件,连同隐藏文件一起列出来 -d  :仅列出目录本身,而不是列出目录内的文件数据 -i   :列出inode号码 - ...

  2. React 从入门到进阶之路(八)

    之前的文章我们介绍了 React中的组件.父子组件.React props父组件给子组件传值.子组件给父组件传值.父组件中通过refs获取子组件属性和方法.接下来我们将介绍 React propTyp ...

  3. POJ3468【线段树lazy操作】

    上午理论AC,打到现在快吐了... 一个那么**Lazy操作打成这样,query操作和update操作都有问题,妈蛋,发现是mid<=s+1-真是蠢到家,明明是mid+1<=s卧槽连左和右 ...

  4. poj1664【DFS】

    思路:搜一下,还想多了,记得以前做过把一个数搞成几个数的相加组合,然后这题无非就是多了个组合的个数<=m的,那么只要多加一个条件,当num>m的时候也return掉就好了. //#incl ...

  5. python 可迭代对象与迭代器之间的转换

    列表: >>> l = [1, 2, 3, 4] >>> l_iter = iter(l) >>> l_iter <list_iterato ...

  6. LuoguP1268树的重量【构造/思维】By cellur925

    题目传送门 Description 给你一个矩阵$M$,$M(i,j)$表示$i$到$j$的最短距离.定义树的重量为树上各边权之和,对于任意给出的合法矩阵$M$,已知它所能表示树的重量是唯一确定的.给 ...

  7. Hdu 5442 Favorite Donut (2015 ACM/ICPC Asia Regional Changchun Online 最大最小表示法 + KMP)

    题目链接: Hdu 5442 Favorite Donut 题目描述: 给出一个文本串,找出顺时针或者逆时针循环旋转后,字典序最大的那个字符串,字典序最大的字符串如果有多个,就输出下标最小的那个,如果 ...

  8. UvaLive6442(思维、结论)

    结论是:按位置排序好以后,对于真正的答案,走法应该是:依次走向第0个等分点,第1个等分点……这样对于这种等分情况,是最优的调度. /* 先假设一个终点位置然后按位站好 这个位置不一定是最优所以要调 调 ...

  9. UVa 12186 Another Crisis 工人的请愿书

    c表示某上司上报的最少请愿下属,k表示总下属c=0.01T*k=kT/100(0.01T*k是整数)c=[0.01T*k]+1=[kT/100]+1(0.01T*k不是整数) kT=100 c=1 k ...

  10. 线段树(单点更新) HDOJ 4288 Coder

    题目传送门 #include <cstdio> #include <cstring> #define lson l, m, rt << 1 #define rson ...