process data
- # version 1.0
def connect_mysql(sql, oper_type="select", data_l=None):
    """Run one SQL statement against the local ``work`` MySQL database.

    Args:
        sql: Statement text. With ``oper_type="insert"`` it is handed to
            ``executemany`` together with ``data_l``; otherwise it is
            executed as-is.
        oper_type: ``"insert"`` performs a batched write followed by a
            commit; any other value runs ``sql`` and fetches all rows.
        data_l: Parameter rows for the batched insert. Ignored for queries.

    Returns:
        The value of ``cursor.fetchall()`` for a query, or ``None`` for an
        insert (the original left the result unbound on that path and
        raised ``UnboundLocalError``).
    """
    conn = pymysql.connect(host='localhost', user="root", password="",
                           database="work", port=3306)
    result = None
    try:
        cur = conn.cursor()
        if oper_type == "insert":
            cur.executemany(sql, data_l)
            conn.commit()
        else:
            cur.execute(sql)
            result = cur.fetchall()
    finally:
        # Release the connection even when the statement raises.
        conn.close()
    return result
def process_jobs(field_name):
    """Count how many ``personal_jobs`` rows carry each distinct column value.

    Args:
        field_name: Column name. It is concatenated into the SQL text
            unchanged, so it must come from trusted code, not user input.

    Returns:
        A ``(row_total, category_dict)`` tuple: the number of fetched rows
        and a dict mapping each distinct fetched row to its occurrence
        count.
    """
    sql = "select j." + field_name + " FROM personal_jobs j"
    column_name = connect_mysql(sql, oper_type="select")
    row_total = len(column_name)

    # Seed the dict from the set (same key order as before), then tally in
    # one pass instead of re-scanning every row for every category.
    category_dict = dict.fromkeys(set(column_name), 0)
    for row in column_name:
        category_dict[row] += 1

    print(type(category_dict.items()), category_dict.items())
    print(row_total, len(category_dict.items()))
    return row_total, category_dict
# Module-level run: tally the job_salary column. The stray leading "- " is
# dropped because it negated the returned tuple and raised TypeError.
process_jobs("job_salary")
- # version 1.1
def count_times(all_list):
    """Tally how often each distinct item occurs.

    Args:
        all_list: Iterable of hashable items that can be ordered against
            each other (the result is sorted by item).

    Returns:
        A list of mutable ``[item, count]`` pairs sorted ascending by item.
        Callers may rewrite ``pair[0]`` in place.
    """
    # Imported here so the function does not depend on where the module's
    # own import lines sit relative to this definition.
    from collections import Counter

    # One counting pass replaces a full ``.count()`` scan per distinct item.
    return sorted([item, times] for item, times in Counter(all_list).items())
def _salary_bounds(row):
    """Return ``(low, high)`` parsed from one salary row, or ``None``.

    Only values shaped like ``10K-20K`` (trailing ``K`` optional) are
    accepted; anything else is reported as ``None`` so the caller can skip
    it instead of crashing.
    """
    # Imported here so the helper does not depend on the module's import block.
    import re

    # Rows are expected to be one-column sequences from fetchall(); a bare
    # value is also tolerated.
    text = row[0] if isinstance(row, (tuple, list)) and row else row
    found = re.fullmatch(r"\s*(\d+)\s*[Kk]\s*-\s*(\d+)\s*[Kk]?\s*", str(text))
    if found is None:
        return None
    return int(found.group(1)), int(found.group(2))


def _with_k_suffix(pairs):
    """Rewrite the first element of every ``[value, count]`` pair as ``"<value>K"``."""
    for pair in pairs:
        pair[0] = str(pair[0]) + "K"
    return pairs


def process_salary(field_name):
    """Summarise salary ranges of entry-level postings.

    Fetches ``field_name`` for rows whose ``job_exp`` is '1年以内' or
    '经验不限' and builds three tallies.

    Args:
        field_name: Column to select. It is concatenated into the SQL text
            unchanged, so it must come from trusted code. The lower/upper
            tallies are only filled when it is exactly ``"job_salary"``,
            as in the original.

    Returns:
        A ``(min_sal, avg_sal, max_sal)`` tuple of sorted
        ``[label, count]`` lists:

        * ``min_sal`` / ``max_sal``: how many *distinct* salary ranges
          share each lower / upper bound, labelled like ``"10K"``.
        * ``avg_sal``: for every distinct range, its midpoint (labelled
          like ``"15.0K"``) and the number of rows with that range.

        Rows that do not parse as ``<low>K-<high>K`` are skipped. The
        original parsed the ``repr`` of the row tuple and crashed on them.
    """
    sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
    original_sal = connect_mysql(sql)

    if field_name == "job_exp":
        # Debug aid kept from the original experience branch.
        print(original_sal)

    # Parse every distinct row once; unparseable rows are simply absent.
    parsed = {}
    for row in set(original_sal):
        bounds = _salary_bounds(row)
        if bounds is not None:
            parsed[row] = bounds

    general_min, general_max = [], []
    if field_name == "job_salary":
        for low, high in parsed.values():
            general_min.append(low)
            general_max.append(high)

    min_sal = _with_k_suffix(count_times(general_min))
    max_sal = _with_k_suffix(count_times(general_max))

    row_counts = count_times(original_sal)
    print("original: ", row_counts)
    # Sort by midpoint before labelling so the order is numeric, not lexical.
    avg_sal = sorted(
        [(parsed[row][0] + parsed[row][1]) / 2.0, times]
        for row, times in row_counts
        if row in parsed
    )
    avg_sal = _with_k_suffix(avg_sal)

    # debug
    print(len(min_sal), min_sal)
    print(len(avg_sal), avg_sal)
    print(len(max_sal), max_sal)
    return min_sal, avg_sal, max_sal
- # process_salary("job_salary")
- import jieba
- from wordcloud import WordCloud
- import matplotlib.pyplot as plt
- from collections import Counter
- from scipy.misc import imread
def process_reqirement(field_name):
    """Count how often each known skill keyword appears in entry-level postings.

    Fetches ``field_name`` for rows whose ``job_exp`` is '1年以内' or
    '经验不限', segments the text with jieba and totals the tokens
    case-insensitively.

    Args:
        field_name: Column to select. It is concatenated into the SQL text
            unchanged, so it must come from trusted code.

    Returns:
        A list of ``[keyword, count]`` pairs, one per entry of the built-in
        keyword list that occurs at least once, in keyword-list order.
        ``count`` sums every case variant of the keyword. The original
        also returned every token with a doubled count, and counted only
        the first case variant of each keyword.
    """
    sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
    original_req = connect_mysql(sql)
    userdict = ["C", "C#", "C++", "Go", "Linux", "MongoDB", "Mysql", "PostgreSQL", "Ajax", "Bootstrap", "CSS",
                "Django", "Docker", "Flask", "Git", "http", "tcp", "Java", "JavaScript", "Jquery", "Oracle",
                "Python", "Redis", "Ruby", "Scrapy", "shell", "Tornado", "Web", "Zabbix", "RESTful", "云计算",
                "分布式", "前端", "后端", "大数据", "高并发", "数据分析", "数据挖掘", "机器学习", "爬虫", "算法",
                "自动化", "运维", "集群"]
    # NOTE(review): load_userdict is documented for a path or file-like
    # object; a list of lines is passed here as in the original - confirm
    # against the installed jieba version.
    jieba.load_userdict(userdict)

    # Segment the column text itself rather than the repr of the row
    # tuples, which injects quotes, commas and escaped newlines as tokens.
    # Assumes rows are one-column sequences as returned by fetchall().
    text = "\n".join(str(row[0]) for row in original_req if row and row[0] is not None)

    # Fold case while counting so "Python" and "python" share one total.
    folded = Counter()
    for token in jieba.cut(text):
        folded[token.lower()] += 1

    req_list = []
    for term in userdict:
        total = folded.get(term.lower(), 0)
        if total:
            req_list.append([term, total])
    return req_list
# Module-level run: count skill keywords in the job_requirement column. The
# stray leading "- " is dropped because it negated the returned list and
# raised TypeError.
process_reqirement("job_requirement")
def user_defined(file_name):
    """Load a UTF-8 text file as a list of whitespace-stripped lines.

    Args:
        file_name: Path of the file to read.

    Returns:
        One string per line, in file order, with leading and trailing
        whitespace removed. Blank lines are kept as empty strings.
    """
    with open(file_name, "r", encoding="utf8") as handle:
        return [line.strip() for line in handle]
def process_company(field_name):
    """Score each posting by how much of its skill list matches a personal skill set.

    Args:
        field_name: Column list for the SELECT. Column 0 is used as the
            label and column 1 as the requirement text (the script passes
            ``"company_name, job_requirement"``). It is concatenated into
            the SQL text unchanged, so it must come from trusted code.

    Returns:
        A list of ``[label, percent]`` pairs sorted ascending by percent.
        ``percent`` is the share (0-100, truncated) of the posting's
        recognised skills that are also in the personal list. Postings
        with no recognised skill or no overlap are omitted.
    """
    sql = "select " + field_name + " from work.personal_jobs"
    company = connect_mysql(sql)

    # The original also read "t.txt" here and immediately overwrote the
    # result with this literal list. That dead read is gone, so a missing
    # file no longer aborts the run.
    user_list = ['C', 'C#', 'C++', 'Go', 'Linux', 'MongoDB', 'Mysql', 'PostgreSQL', 'Ajax', 'Bootstrap', 'CSS',
                 'Django', 'Docker', 'Flask', 'Git', 'http', 'tcp', 'Java', 'JavaScript', 'Jquery', 'Oracle',
                 'Python', 'Redis', 'Ruby', 'Scrapy', 'shell', 'Tornado', 'Web', 'RESTful', '云计算', '分布式',
                 '前端', '后端', '大数据', '高并发', '数据分析', '数据挖掘', '机器学习', '爬虫', '算法', '自动化',
                 '测试', '运维', '集群']
    # NOTE(review): load_userdict is documented for a path or file-like
    # object; a list of lines is passed here as in the original - confirm
    # against the installed jieba version.
    jieba.load_userdict(user_list)
    me_list = ['python', 'django', 'linux', '运维', '自动化', '爬虫', '数据分析', 'shell', 'mysql', 'oracle']

    # Compare in lower case: the personal list is lower-case while the
    # skill list is mixed-case, so exact matching could never pair
    # 'python' with 'Python'.
    known_skills = {skill.lower() for skill in user_list}
    my_skills = {skill.lower() for skill in me_list}

    suit_list = []
    for row in company:
        label, requirement = row[0], row[1]
        if not requirement:
            # NULL or empty requirement text has nothing to segment.
            continue
        wanted = {token.lower() for token in jieba.cut(requirement)} & known_skills
        mine = wanted & my_skills
        if mine:
            suit_list.append([label, int(len(mine) * 100 / len(wanted))])
    return sorted(suit_list, key=lambda x: x[1])
- # print(sorted(suit_list, key=lambda x: x[1]))
# Module-level run: score postings from the company_name and
# job_requirement columns. The stray leading "- " is dropped because it
# negated the returned list and raised TypeError.
process_company("company_name, job_requirement")
process data的更多相关文章
- 1.3 Quick Start中 Step 8: Use Kafka Streams to process data官网剖析(博主推荐)
不多说,直接上干货! 一切来源于官网 http://kafka.apache.org/documentation/ Step 8: Use Kafka Streams to process data ...
- [CDH] Process data: integrate Spark with Spring Boot
c 一.Spark 统计计算 简单统计后写入Redis. /** * 订单统计和乘车人数统计 */ object OrderStreamingProcessor { def main(args: Ar ...
- Flink应用案例:How Trackunit leverages Flink to process real-time data from industrial IoT devices
January 22, 2019Use Cases, Apache Flink Lasse Nedergaard Recently there has been significant dis ...
- [AJAX系列]$.post(url,[data],[fn],[type])
概述: 通过远程HTTP POST请求载入信息 参数: url:发送请求地址 data:待发送Key/value值 callback:发送成功时回调函数 type:返回内容格式 xml html ...
- Data Science at the Command Line学习笔记(二)
1.vagrant建立简单httpserver方法: 1)映射端口 修改Vagrantfile, 末尾添加本地端口和虚机端口的映射关系, 然后执行vagrant reload. Vagrant::Co ...
- [Chapter 3 Process]Practice 3.3 Discuss three major complications that concurrent processing adds to an operating system.
3.3 Original version of Apple's mobile iOS operating system provided no means of concurrent processi ...
- Learn know more about big data
As we all know, we are in a big data age now. "Every sword has two sides", as an ITer, we shou ...
- Monitoring and Tuning the Linux Networking Stack: Receiving Data
http://blog.packagecloud.io/eng/2016/06/22/monitoring-tuning-linux-networking-stack-receiving-data/ ...
- Big Data Analytics for Security(Big Data Analytics for Security Intelligence)
http://www.infoq.com/articles/bigdata-analytics-for-security This article first appeared in the IEEE ...
随机推荐
- django上课笔记6-MVC,MTV架构-中间件-初识Form组件
一.MVC,MTV架构 models(数据库,模型) views(html模板) controllers(业务逻辑处理) --> MVC models(数据库,模型) templates(htm ...
- hadoop中的序列化
此文已由作者肖凡授权网易云社区发布. 欢迎访问网易云社区,了解更多网易技术产品运营经验. 最近在学习hadoop,发现hadoop的序列化过程和jdk的序列化有很大的区别,下面就来说说这两者的区别都有 ...
- poj1312dfs基础
就是很简单的DFS-因为数据偏小,上去就是干了 #include <stdio.h> #include <string.h> #include <math.h> # ...
- poj 1743 Musical Theme【二分+SA】
差分,然后二分长度mid,判断是把height按照min不小于mid分组,取最大最小的sa位置看是否>=mid即可,注意差分后最后答案要+1 #include<iostream> # ...
- 第二篇 .NET高级技术之密闭类和静态类及扩展方法
1.密闭类是修饰为sealed的类, sealed不能有子类.一般只有系统中的一些基本类声明为sealed.面试题:是否可以编写一个类继承自String类? 答:不能,因为string被声明为了sea ...
- tyvj 1391 走廊泼水节【最小生成树】By cellur925
题目传送门 题意简化:给你一棵树,要求你加边使它成为完全图(任意两点间均有一边相连) ,满足原来的树是这个图的最小生成树.求加边的价值最小是多少. 考虑Kruskal的过程,我们每次找一条最短的,两边 ...
- Codeforces Round #513解题报告(A~E)By cellur925
我是比赛地址 A:Phone Numbers $Description$:给你一串数字,问你能组成多少开头为8的11位电话号码. $Sol$:统计8的数量,与$n$%11作比较. #include&l ...
- Springboot配置类
配置类 MyAppConfig import com.test.springboot.service.HelloService; import org.springframework.context ...
- 递归(Recursion)
递归是一种非常常用的算法,分为“递”和“归”两个步骤.满足递归算法有三个条件:1.一个问题,可以分解为子问题:2.该问题,与分解后的子问题,解决思路一致:3.存在终止条件.案例演示:假设有n个台阶,每 ...
- POJ 1151 Atlantis(扫描线)
题目原链接:http://poj.org/problem?id=1151 题目中文翻译: POJ 1151 Atlantis Time Limit: 1000MS Memory Limit: 10 ...