# version 1.0
def connect_mysql(sql, oper_type="select", data_l=None):
conn = pymysql.connect(host='localhost', user="root", password="",
database="work", port=3306)
cur = conn.cursor()
if oper_type == "insert":
cur.executemany(sql, data_l)
conn.commit()
else:
cur.execute(sql)
result = cur.fetchall()
# print(type(result), "result")
conn.close()
return result def process_jobs(field_name):
sql = "select j." + field_name + " FROM personal_jobs j"
column_name = connect_mysql(sql, oper_type="select")
row_total = (len(column_name))
row_category = set(column_name) # init category dict
category_dict = {}
for k in row_category:
category_dict[k] = 0 # calculate amount
cal_nmu = 0
for k in row_category:
for r in column_name:
if r == k:
cal_nmu += 1
category_dict[k] = cal_nmu
cal_nmu = 0
print(type(category_dict.items()), category_dict.items())
print(row_total, len(category_dict.items()))
return row_total, category_dict process_jobs("job_salary")
version 1.1
def count_times(all_list):
ls = []
item_list = list(set(all_list))
for m in item_list:
c = all_list.count(m)
ls.append([m, c])
return sorted(ls) def process_salary(field_name):
# sql = "select " + field_name + " from work.personal_jobs where job_exp = '1-3年';"
sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
original_sal = connect_mysql(sql)
# sort salary order
row_category = list(set(original_sal))
general_min, general_avg, general_max = [], [], []
# cal_num = 0
for sal in row_category:
# calculate category amount
# for cat in column_name:
# if cat == sal:
# cal_num += 1
# process salary
if field_name == "job_salary":
sal_tmp = str(sal).strip("('").strip("K',)").split("K-")
general_min.append(int(sal_tmp[0]))
general_max.append(int(sal_tmp[1])) # process experience
if field_name == "job_exp":
print(original_sal) # initial again
# cal_num = 0 # calculate min sal
min_sal = count_times(general_min)
for m1 in min_sal:
min_s = str(m1[0]) + "K"
m1[0] = min_s # calculate max sal
max_sal = count_times(general_max)
for m2 in max_sal:
min_s = str(m2[0]) + "K"
m2[0] = min_s # calculate avg sal
avg_sal = count_times(original_sal)
print("original: ", avg_sal)
for a1 in avg_sal:
sal_tmp_1 = str(a1[0]).strip("('").strip("K',)").split("K-")
a1[0] = (int(sal_tmp_1[0]) + int(sal_tmp_1[1])) / 2.0
avg_sal = sorted(avg_sal) for a2 in avg_sal:
a2[0] = str(a2[0]) + "K"
# debug
print(len(min_sal), min_sal)
print(len(avg_sal), avg_sal)
print(len(max_sal), max_sal)
return min_sal, avg_sal, max_sal # process_salary("job_salary")
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from scipy.misc import imread def process_reqirement(field_name):
sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
original_req = connect_mysql(sql)
userdict = ["C", "C#", "C++", "Go", "Linux", "MongoDB", "Mysql", "PostgreSQL", "Ajax", "Bootstrap", "CSS", "Django", "Docker", "Flask", "Git", "http", "tcp", "Java", "JavaScript", "Jquery", "Oracle", "Python", "Redis", "Ruby", "Scrapy", "shell", "Tornado", "Web", "Zabbix", "RESTful", "云计算", "分布式", "前端", "后端", "大数据", "高并发", "数据分析", "数据挖掘", "机器学习", "爬虫", "算法", "自动化", "运维", "集群"] jieba.load_userdict(userdict)
# print(type(original_req), str(original_req))
text0 = Counter(jieba.cut(str(original_req)))
text1 = " ".join(jieba.cut(str(original_req)))
[item for item in sorted(text0.values())]
# print(text0.keys(), text0.values())
# print(type(text0), text0) # # create word cloud
# wordcloud = WordCloud(font_path=r"D:\wwj\work\script\web\personal\database_operation\MSYH.TTC",
# background_color="white", mask=imread("china.jpg")).generate(text1)
# plt.imshow(wordcloud)
# plt.axis("off")
# plt.show() # find requirement item what we really need
req_list = []
# print(len(text0.keys()), text0)
for k, v in text0.items():
for kk, vv in text0.items():
if str(k).lower() == str(kk).lower():
# print(k, v)
req_list.append([k, (v + vv)])
# print(k, v)
break
print(len(req_list), req_list) for t in userdict:
for k, v in text0.items():
if t.lower() == str(k).lower():
req_list.append([t, v])
break
# print(req_list)
return req_list
process_reqirement("job_requirement")
def user_defined(file_name):
user_list = []
with open(file_name, "r", encoding="utf8") as f:
for i in f:
user_list.append(i.strip())
return user_list def process_company(field_name):
sql = "select " + field_name + " from work.personal_jobs"
company = [list(i) for i in connect_mysql(sql)]
user_list = user_defined("t.txt")
user_list = ['C','C#','C++','Go','Linux','MongoDB','Mysql','PostgreSQL','Ajax','Bootstrap','CSS','Django','Docker','Flask','Git','http','tcp','Java','JavaScript','Jquery','Oracle','Python','Redis','Ruby','Scrapy','shell','Tornado','Web','RESTful','云计算','分布式','前端','后端','大数据','高并发','数据分析','数据挖掘','机器学习','爬虫','算法','自动化','测试','运维','集群']
jieba.load_userdict(user_list)
me_list = ['python', 'django', 'linux', '运维', '自动化', '爬虫', '数据分析', 'shell', 'mysql', 'oracle']
req_list, suit_list = [], []
for req in company:
req_dict = Counter(jieba.cut(req[1]))
req_list.append([req[0], [k for k in req_dict.keys() if k in user_list]])
for r in req_list:
if len(r[1]) > 0:
# print(r[1])
own = [item for item in me_list if item in r[1]]
if len(own) > 0:
suit_list.append([r[0], int(len(own) * 100/len(r[1]))])
return sorted(suit_list, key=lambda x: x[1])
# print(sorted(suit_list, key=lambda x: x[1]))
process_company("company_name, job_requirement")

process data的更多相关文章

  1. 1.3 Quick Start中 Step 8: Use Kafka Streams to process data官网剖析(博主推荐)

    不多说,直接上干货! 一切来源于官网 http://kafka.apache.org/documentation/ Step 8: Use Kafka Streams to process data ...

  2. [CDH] Process data: integrate Spark with Spring Boot

    c 一.Spark 统计计算 简单统计后写入Redis. /** * 订单统计和乘车人数统计 */ object OrderStreamingProcessor { def main(args: Ar ...

  3. Flink应用案例:How Trackunit leverages Flink to process real-time data from industrial IoT devices

    January 22, 2019Use Cases, Apache Flink Lasse Nedergaard     Recently there has been significant dis ...

  4. [AJAX系列]$.post(url,[data],[fn],[type])

    概述: 通过远程HTTP POST请求载入信息 参数: url:发送请求地址 data:待发送Key/value值 callback:发送成功时回调函数 type:返回内容格式  xml  html ...

  5. Data Science at the Command Line学习笔记(二)

    1.vagrant建立简单httpserver方法: 1)映射端口 修改Vagrantfile, 末尾添加本地端口和虚机端口的映射关系, 然后执行vagrant reload. Vagrant::Co ...

  6. [Chapter 3 Process]Practice 3.3 Discuss three major complications that concurrent processing adds to an operating system.

    3.3  Original version of Apple's mobile iOS operating system provied no means of concurrent processi ...

  7. Learn know more about big data

    As we all know,we are in a big data age now."Every sword has two slides",as a ITer,we shou ...

  8. Monitoring and Tuning the Linux Networking Stack: Receiving Data

    http://blog.packagecloud.io/eng/2016/06/22/monitoring-tuning-linux-networking-stack-receiving-data/ ...

  9. Big Data Analytics for Security(Big Data Analytics for Security Intelligence)

    http://www.infoq.com/articles/bigdata-analytics-for-security This article first appeared in the IEEE ...

随机推荐

  1. linux中用管道实现父子进程通信

    1 用户要实现父进程到子进程的数据通道,可以在父进程关闭管道读出一端, 然后相应的子进程关闭管道的输入端. 2 先用pipe()建立管道 然后fork函数创建子进程.父进程向子进程发消息,子进程读消息 ...

  2. hdoj1078【DP·记忆化搜索】

    还是满水的一道题目吧...这个一看肯定要搜索了..然后又是这么DP,那就是记忆化搜索了...走K步,下一步要比他多...很好写啊/// #include<iostream> #includ ...

  3. hdu2767(图的强连通)

    //题意:问需要添加几条边使得这张图成为每个点都等价(强连通图) 我们先把图中的强连通分量缩点 可能他本身就是满足条件,那么直接输出0 经过缩点后,就可以把强连通分量看成一个个独立的点,在这张图上搞一 ...

  4. bzoj 1023: [SHOI2008]cactus仙人掌图【tarjan+dp+单调队列】

    本来想先求出点双再一个一个处理结果写了很长发现太麻烦 设f[u]为u点向下的最长链 就是再tarjan的过程中,先照常处理,用最长儿子链和次长儿子链更新按ans,然后处理以这个点为根的环,也就是这个点 ...

  5. bzoj 1814: Ural 1519 Formula 1【插头dp】

    设f[i][j][s]为轮廓线推到格子(i,j),状态为s的方案数 括号表示一段线的左端和右端,表示成左括号和右括号,状压的时候用1和2表示,0表示已经闭合 下面的蓝线是黄色格子的轮廓线,dp转移要把 ...

  6. loj#2540. 「PKUWC2018」随机算法

    传送门 完了pkuwc咋全是dp怕是要爆零了-- 设\(f(S)\)表示\(S\)的排列数,\(S\)为不能再选的点集(也就是选到独立集里的点和与他们相邻的点),\(mx(S)\)表示\(S\)状态下 ...

  7. 初识Sklearn-IrisData训练与预测

    笔记:机器学习入门---鸢尾花分类 Sklearn 本身就有很多数据库,可以用来练习. 以 Iris 的数据为例,这种花有四个属性,花瓣的长宽,茎的长宽,根据这些属性把花分为三类:山鸢尾花Setosa ...

  8. ubuntu 下配置django 项目能够被局域网下的其他电脑访问

    在项目下的路径下下运行 python manage.py runserver 后面的端口换成其他可用的端口也可以 如何让外网也能访问呢,有待更新

  9. nginx memcache缓存

    1 基本 在一个lnmp架构中,nginx遇到动态资源,会反向代理,把请求发送到后端的php-fpm服务,php-fpm从mysql里读取数据,生产网页,然后返回给client. 如果流量大,php- ...

  10. New Land LightOJ - 1424

    New Land LightOJ - 1424 题意:找出01矩阵中最大的完全由0组成的矩阵. 方法: 重点在于转化. 先预处理(i,j)点向上最长能取到的连续的全0条的长度.然后枚举某一行作为矩阵的 ...