process data
# version 1.0
def connect_mysql(sql, oper_type="select", data_l=None):
conn = pymysql.connect(host='localhost', user="root", password="",
database="work", port=3306)
cur = conn.cursor()
if oper_type == "insert":
cur.executemany(sql, data_l)
conn.commit()
else:
cur.execute(sql)
result = cur.fetchall()
# print(type(result), "result")
conn.close()
return result def process_jobs(field_name):
sql = "select j." + field_name + " FROM personal_jobs j"
column_name = connect_mysql(sql, oper_type="select")
row_total = (len(column_name))
row_category = set(column_name) # init category dict
category_dict = {}
for k in row_category:
category_dict[k] = 0 # calculate amount
cal_nmu = 0
for k in row_category:
for r in column_name:
if r == k:
cal_nmu += 1
category_dict[k] = cal_nmu
cal_nmu = 0
print(type(category_dict.items()), category_dict.items())
print(row_total, len(category_dict.items()))
return row_total, category_dict process_jobs("job_salary")
version 1.1
def count_times(all_list):
ls = []
item_list = list(set(all_list))
for m in item_list:
c = all_list.count(m)
ls.append([m, c])
return sorted(ls) def process_salary(field_name):
# sql = "select " + field_name + " from work.personal_jobs where job_exp = '1-3年';"
sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
original_sal = connect_mysql(sql)
# sort salary order
row_category = list(set(original_sal))
general_min, general_avg, general_max = [], [], []
# cal_num = 0
for sal in row_category:
# calculate category amount
# for cat in column_name:
# if cat == sal:
# cal_num += 1
# process salary
if field_name == "job_salary":
sal_tmp = str(sal).strip("('").strip("K',)").split("K-")
general_min.append(int(sal_tmp[0]))
general_max.append(int(sal_tmp[1])) # process experience
if field_name == "job_exp":
print(original_sal) # initial again
# cal_num = 0 # calculate min sal
min_sal = count_times(general_min)
for m1 in min_sal:
min_s = str(m1[0]) + "K"
m1[0] = min_s # calculate max sal
max_sal = count_times(general_max)
for m2 in max_sal:
min_s = str(m2[0]) + "K"
m2[0] = min_s # calculate avg sal
avg_sal = count_times(original_sal)
print("original: ", avg_sal)
for a1 in avg_sal:
sal_tmp_1 = str(a1[0]).strip("('").strip("K',)").split("K-")
a1[0] = (int(sal_tmp_1[0]) + int(sal_tmp_1[1])) / 2.0
avg_sal = sorted(avg_sal) for a2 in avg_sal:
a2[0] = str(a2[0]) + "K"
# debug
print(len(min_sal), min_sal)
print(len(avg_sal), avg_sal)
print(len(max_sal), max_sal)
return min_sal, avg_sal, max_sal # process_salary("job_salary")
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from scipy.misc import imread def process_reqirement(field_name):
sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
original_req = connect_mysql(sql)
userdict = ["C", "C#", "C++", "Go", "Linux", "MongoDB", "Mysql", "PostgreSQL", "Ajax", "Bootstrap", "CSS", "Django", "Docker", "Flask", "Git", "http", "tcp", "Java", "JavaScript", "Jquery", "Oracle", "Python", "Redis", "Ruby", "Scrapy", "shell", "Tornado", "Web", "Zabbix", "RESTful", "云计算", "分布式", "前端", "后端", "大数据", "高并发", "数据分析", "数据挖掘", "机器学习", "爬虫", "算法", "自动化", "运维", "集群"] jieba.load_userdict(userdict)
# print(type(original_req), str(original_req))
text0 = Counter(jieba.cut(str(original_req)))
text1 = " ".join(jieba.cut(str(original_req)))
[item for item in sorted(text0.values())]
# print(text0.keys(), text0.values())
# print(type(text0), text0) # # create word cloud
# wordcloud = WordCloud(font_path=r"D:\wwj\work\script\web\personal\database_operation\MSYH.TTC",
# background_color="white", mask=imread("china.jpg")).generate(text1)
# plt.imshow(wordcloud)
# plt.axis("off")
# plt.show() # find requirement item what we really need
req_list = []
# print(len(text0.keys()), text0)
for k, v in text0.items():
for kk, vv in text0.items():
if str(k).lower() == str(kk).lower():
# print(k, v)
req_list.append([k, (v + vv)])
# print(k, v)
break
print(len(req_list), req_list) for t in userdict:
for k, v in text0.items():
if t.lower() == str(k).lower():
req_list.append([t, v])
break
# print(req_list)
return req_list
process_reqirement("job_requirement")
def user_defined(file_name):
user_list = []
with open(file_name, "r", encoding="utf8") as f:
for i in f:
user_list.append(i.strip())
return user_list def process_company(field_name):
sql = "select " + field_name + " from work.personal_jobs"
company = [list(i) for i in connect_mysql(sql)]
user_list = user_defined("t.txt")
user_list = ['C','C#','C++','Go','Linux','MongoDB','Mysql','PostgreSQL','Ajax','Bootstrap','CSS','Django','Docker','Flask','Git','http','tcp','Java','JavaScript','Jquery','Oracle','Python','Redis','Ruby','Scrapy','shell','Tornado','Web','RESTful','云计算','分布式','前端','后端','大数据','高并发','数据分析','数据挖掘','机器学习','爬虫','算法','自动化','测试','运维','集群']
jieba.load_userdict(user_list)
me_list = ['python', 'django', 'linux', '运维', '自动化', '爬虫', '数据分析', 'shell', 'mysql', 'oracle']
req_list, suit_list = [], []
for req in company:
req_dict = Counter(jieba.cut(req[1]))
req_list.append([req[0], [k for k in req_dict.keys() if k in user_list]])
for r in req_list:
if len(r[1]) > 0:
# print(r[1])
own = [item for item in me_list if item in r[1]]
if len(own) > 0:
suit_list.append([r[0], int(len(own) * 100/len(r[1]))])
return sorted(suit_list, key=lambda x: x[1])
# print(sorted(suit_list, key=lambda x: x[1]))
process_company("company_name, job_requirement")
process data的更多相关文章
- 1.3 Quick Start中 Step 8: Use Kafka Streams to process data官网剖析(博主推荐)
不多说,直接上干货! 一切来源于官网 http://kafka.apache.org/documentation/ Step 8: Use Kafka Streams to process data ...
- [CDH] Process data: integrate Spark with Spring Boot
c 一.Spark 统计计算 简单统计后写入Redis. /** * 订单统计和乘车人数统计 */ object OrderStreamingProcessor { def main(args: Ar ...
- Flink应用案例:How Trackunit leverages Flink to process real-time data from industrial IoT devices
January 22, 2019Use Cases, Apache Flink Lasse Nedergaard Recently there has been significant dis ...
- [AJAX系列]$.post(url,[data],[fn],[type])
概述: 通过远程HTTP POST请求载入信息 参数: url:发送请求地址 data:待发送Key/value值 callback:发送成功时回调函数 type:返回内容格式 xml html ...
- Data Science at the Command Line学习笔记(二)
1.vagrant建立简单httpserver方法: 1)映射端口 修改Vagrantfile, 末尾添加本地端口和虚机端口的映射关系, 然后执行vagrant reload. Vagrant::Co ...
- [Chapter 3 Process]Practice 3.3 Discuss three major complications that concurrent processing adds to an operating system.
3.3 Original version of Apple's mobile iOS operating system provied no means of concurrent processi ...
- Learn know more about big data
As we all know,we are in a big data age now."Every sword has two slides",as a ITer,we shou ...
- Monitoring and Tuning the Linux Networking Stack: Receiving Data
http://blog.packagecloud.io/eng/2016/06/22/monitoring-tuning-linux-networking-stack-receiving-data/ ...
- Big Data Analytics for Security(Big Data Analytics for Security Intelligence)
http://www.infoq.com/articles/bigdata-analytics-for-security This article first appeared in the IEEE ...
随机推荐
- 看鸟哥的Linux私房菜的一些命令自我总结(二)
-关于执行文件路径的变量 $PATH -查看文件与目录 ls -a :全部的文件,连同隐藏文件一起列出来 -d :仅列出目录本身,而不是列出目录内的文件数据 -i :列出inode号码 - ...
- React 从入门到进阶之路(八)
之前的文章我们介绍了 React中的组件.父子组件.React props父组件给子组件传值.子组件给父组件传值.父组件中通过refs获取子组件属性和方法.接下来我们将介绍 React propTyp ...
- POJ3468【线段树lazy操作】
上午理论AC,打到现在快吐了... 一个那么**Lazy操作打成这样,query操作和update操作都有问题,妈蛋,发现是mid<=s+1-真是蠢到家,明明是mid+1<=s卧槽连左和右 ...
- poj1664【DFS】
思路:搜一下,还想多了,记得以前做过把一个数搞成几个数的相加组合,然后这题无非就是多了个组合的个数<=m的,那么只要多加一个条件,当num>m的时候也return掉就好了. //#incl ...
- python 可迭代对象与迭代器之间的转换
列表: >>> l = [1, 2, 3, 4] >>> l_iter = iter(l) >>> l_iter <list_iterato ...
- LuoguP1268树的重量【构造/思维】By cellur925
题目传送门 Description 给你一个矩阵$M$,$M(i,j)$表示$i$到$j$的最短距离.定义树的重量为树上各边权之和,对于任意给出的合法矩阵$M$,已知它所能表示树的重量是唯一确定的.给 ...
- Hdu 5442 Favorite Donut (2015 ACM/ICPC Asia Regional Changchun Online 最大最小表示法 + KMP)
题目链接: Hdu 5442 Favorite Donut 题目描述: 给出一个文本串,找出顺时针或者逆时针循环旋转后,字典序最大的那个字符串,字典序最大的字符串如果有多个,就输出下标最小的那个,如果 ...
- UvaLive6442(思维、结论)
结论是:按位置排序好以后,对于真正的答案,走法应该是:依次走向第0个等分点,第1个等分点……这样对于这种等分情况,是最优的调度. /* 先假设一个终点位置然后按位站好 这个位置不一定是最优所以要调 调 ...
- UVa 12186 Another Crisis 工人的请愿书
c表示某上司上报的最少请愿下属,k表示总下属c=0.01T*k=kT/100(0.01T*k是整数)c=[0.01T*k]+1=[kT/100]+1(0.01T*k不是整数) kT=100 c=1 k ...
- 线段树(单点更新) HDOJ 4288 Coder
题目传送门 #include <cstdio> #include <cstring> #define lson l, m, rt << 1 #define rson ...