hive学习8（小案例1练习）

创建数据库

-- Create the case-study database and switch to it.
-- if not exists lets the script be re-run without failing on an existing database.
create database if not exists feigu;

use feigu;

创建表

stg_job表

-- stg_job: staging table that receives the crawled job-posting file.
-- Every column is declared as string, and rows are partitioned by pt.
-- The table is dropped first so the definition below always takes effect.
drop table if exists stg_job;

create table if not exists stg_job (
    web_id            string comment 'web id',
    web_type          string comment 'web type',
    job_url           string comment 'job url',
    job_name          string comment 'job name',
    job_location      string comment 'job location',
    job_desc          string comment 'job desc',
    edu               string comment 'education',
    gender            string comment 'gender',
    language          string comment 'language',
    major             string comment 'major',
    work_year         string comment 'work years',
    salary            string comment 'salary',
    company_name      string comment 'company name',
    company_desc      string comment 'company desc',
    company_address   string comment 'company address',
    company_worktype  string comment 'company worktype',
    company_scale     string comment 'company scale',
    company_prop      string comment 'company property',
    company_website   string comment 'company_website',
    curl_timestamp    string comment 'curl timestamp'
)
comment 'all flat data from webpage'
partitioned by (`pt` string comment 'job post date ')
-- Plain text storage: fields are separated by the \001 (Ctrl-A) byte and an
-- empty field is read back as NULL.
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as textfile;

s_job表(与stg_job相同的表结构)

-- s_job copies the column and partition layout of stg_job (create ... like).
-- if not exists keeps a re-run from failing once the table is already there.
create table if not exists s_job like stg_job;

stg_news表



-- stg_news: staging table for news records, four string columns partitioned by pt.
-- NOTE(review): the table comment mentions Dz, presumably the upstream source system. Confirm.
drop table if exists stg_news;

create table if not exists stg_news (
    mysql_newsid  string,
    news_title    string,
    content       string,
    create_time   string
)
comment 'all flat thread from Dz'
partitioned by (`pt` string )
-- Plain text storage: \001 (Ctrl-A) separates fields, empty fields read as NULL.
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as textfile;

dm_job表



-- dm_job: job table carrying the stg_job columns plus job_date and vip_flg.
-- Stored as a sequence file and partitioned by pt.
drop table if exists dm_job;

create table if not exists dm_job (
    web_id            string comment 'web id',
    web_type          string comment 'web type',
    job_url           string comment 'job url',
    job_name          string comment 'job name',
    job_location      string comment 'job location',
    job_desc          string comment 'job desc',
    edu               string comment 'education',
    gender            string comment 'gender',
    language          string comment 'language',
    major             string comment 'major',
    work_year         string comment 'work years',
    salary            string comment 'salary',
    job_date          string comment 'job date',
    company_name      string comment 'company name',
    company_desc      string comment 'company desc',
    company_address   string comment 'company address',
    company_worktype  string comment 'company worktype',
    company_scale     string comment 'company scale',
    company_prop      string comment 'company property',
    company_website   string comment 'company_website',
    curl_timestamp    string comment 'curl timestamp',
    vip_flg           string
)
comment 'compute vip '
partitioned by (`pt` string)
-- Same delimiters as the staging tables: \001 between fields, empty means NULL.
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

dim_edu表



-- dim_edu: education dimension. edu_detail holds the raw text and edu_type the
-- category code assigned to it.
-- NOTE(review): the table comment string misspells dimension. It is kept as-is here.
drop table if exists dim_edu;

create table if not exists dim_edu (
    web_type      string,
    job_name      string,
    company_name  string,
    edu_detail    string,
    edu_type      string
)
comment 'edu dimision'
partitioned by (`pt` string)
-- \001 (Ctrl-A) separates fields, empty fields read as NULL.
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

dim_workyear表



-- dim_workyear: work-experience dimension. workyear_detail holds the raw text
-- and workyear_type the category code assigned to it.
drop table if exists dim_workyear;

create table if not exists dim_workyear (
    web_type         string,
    job_name         string,
    company_name     string,
    workyear_detail  string,
    workyear_type    string
)
comment 'work years'
partitioned by (`pt` string)
-- \001 (Ctrl-A) separates fields, empty fields read as NULL.
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

dim_joblocation表



-- dim_joblocation: work-location dimension. joblocation_detail holds the raw
-- text and joblocation_type the category code assigned to it.
drop table if exists dim_joblocation;

create table if not exists dim_joblocation (
    web_type            string,
    job_name            string,
    company_name        string,
    joblocation_detail  string,
    joblocation_type    string
)
comment 'job location'
partitioned by (`pt` string)
-- \001 (Ctrl-A) separates fields, empty fields read as NULL.
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

dim_salary表

-- dim_salary: salary dimension. salary_detail holds the raw text and
-- salary_type the category code assigned to it.
drop table if exists dim_salary;

create table if not exists dim_salary (
    web_type       string,
    job_name       string,
    company_name   string,
    salary_detail  string,
    salary_type    string
)
comment 'job salary'
partitioned by (`pt` string)
-- \001 (Ctrl-A) separates fields, empty fields read as NULL.
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

数据导入

将爬虫爬取的职位表信息导入到stg_job表中

-- Load the crawled job file from the local filesystem into the pt=20150501
-- partition of stg_job. overwrite replaces whatever that partition already holds.
load data local inpath '/home/data/daily/20150501/51job1.dat'
overwrite into table stg_job
partition (pt = '20150501');

hive数据清洗（ETL）

数据项为空：网页抓取下来的数据可能是空的需要剔除
检索结果不一致：编码或命名差异，例如品牌=耐克，商品品牌=耐克
噪声：包含错误或者异常值，如salary='-100'

数据预处理分为数据清理，数据变换，数据集成

对job_location（工作地点）, edu（学历）, work_year（工作年限）,salary（薪资范围）4列数据的空值进行转换

-- Clean the 20150501 staging partition into s_job.
-- NULL or blank values in job_location, edu, work_year and salary are replaced
-- with a double-dash placeholder. All other columns are copied unchanged.

-- partition (pt) names no static value, so pt is a fully dynamic partition.
-- Hive rejects that under strict dynamic-partition mode, hence these settings.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

-- The select list follows the column order of s_job, with the partition
-- column pt last as dynamic partitioning requires.
insert overwrite table s_job partition (pt)
select
    web_id,
    web_type,
    job_url,
    job_name,
    case when job_location is null or trim(job_location) = '' then '--' else job_location end as job_location,
    job_desc,
    case when edu is null or trim(edu) = '' then '--' else edu end as edu,
    gender,
    language,
    major,
    case when work_year is null or trim(work_year) = '' then '--' else work_year end as work_year,
    case when salary is null or trim(salary) = '' then '--' else salary end as salary,
    company_name,
    company_desc,
    company_address,
    company_worktype,
    company_scale,
    company_prop,
    company_website,
    curl_timestamp,
    pt
from stg_job
where pt = '20150501';

代码注释

insert overwrite table...partition (...) select 将查询结果集写入另一个表中
partition(pt):在目标表中使用了动态分区，会在s_job表中自动创建分区（strict模式下全动态分区会报错，需先执行 set hive.exec.dynamic.partition.mode=nonstrict;）
overwrite会先删除s_job中pt='20150501'分区的数据，避免相同分区下的数据重复导入

hive提取维度信息

抽取“学历要求”维度信息插入到学历维度表

-- Build the education dimension from the 20150501 partition of s_job.
-- edu_type codes:
--   B1  edu mentions 大专 or 专科 (junior college)
--   B2  edu mentions 本科 (bachelor)
--   B3  edu mentions 硕士 or 研究生 (master or postgraduate)
--   B9  anything else
-- Branches are tested in order, so a value matching several gets the first code.

-- partition (pt) is fully dynamic, which strict mode rejects.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

-- insert overwrite rather than insert into, so re-running the load replaces
-- the partition instead of appending a second copy of every row.
insert overwrite table dim_edu partition (pt)
select
    web_type,
    job_name,
    company_name,
    edu as edu_detail,
    case
        when edu like '%大专%' or edu like '%专科%' then 'B1'
        when edu like '%本科%' then 'B2'
        when edu like '%硕士%' or edu like '%研究生%' then 'B3'
        else 'B9'
    end as edu_type,
    pt
from s_job
where pt = '20150501';

抽取“工作地点”维度信息插入到工作地点维度表

-- Build the work-location dimension from the 20150501 partition of s_job.
-- joblocation_type codes:
--   A1  job_location mentions 北京 (Beijing)
--   A2  job_location mentions 上海 (Shanghai)
--   A3  job_location mentions 广州 (Guangzhou)
--   A4  job_location mentions 深圳 (Shenzhen)
--   A9  anything else
-- Branches are tested in order, so a value naming several cities gets the first code.

-- partition (pt) is fully dynamic, which strict mode rejects.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

-- insert overwrite rather than insert into, so re-running the load replaces
-- the partition instead of appending a second copy of every row.
insert overwrite table dim_joblocation partition (pt)
select
    web_type,
    job_name,
    company_name,
    job_location as joblocation_detail,
    case
        when job_location like '%北京%' then 'A1'
        when job_location like '%上海%' then 'A2'
        when job_location like '%广州%' then 'A3'
        when job_location like '%深圳%' then 'A4'
        else 'A9'
    end as joblocation_type,
    pt
from s_job
where pt = '20150501';

hive学习8（小案例1练习）的更多相关文章

hive学习(五) 应用案例
1.实现struct数据结构例子 1.1创建student表 create table student( id int, info struct<name:string,age:int> ...
Hive学习系列博客
原 Hive作业优化原 Hive学习六:HIVE日志分析(用户画像) 原 Hive学习五--日志案例分析原 Hive学习三原 Hive学习二原 Hive学习一博客来源,https://blo ...
Vue小案例之商品管理------学习过滤器使用过滤器处理日期的格式
代码学习过滤器过滤器介绍:过滤模型数据,在数据显示前做预处理操作: 内置过滤器:在1.x中,Vue提供了内置过滤器,但是在2.x中已经完全废除: 解决办法: (1)使用第三方库来替代1.x中的内置过 ...
JavaScript_DOM学习篇_图片切换小案例
今天开始学习DOM操作,下面写一个小案例来巩固下知识点. DOM: document object model (文档对象模型) 根据id获取页面元素 : 如: var xx = document.g ...
[jQuery学习系列六]6-jQuery实际操作小案例
前言最后在这里po上jQuery的几个小案例. Jquery例子1_占位符使用需求: 点击第一个按钮后自动去check 后面是否有按钮没有选中, 如有则提示错误消息. <html> &l ...
【大数据】Hive学习笔记
第1章 Hive基本概念 1.1 什么是Hive Hive:由Facebook开源用于解决海量结构化日志的数据统计. Hive是基于Hadoop的一个数据仓库工具,可以将结构化的数据文件映射为一张表, ...
hive学习
大数据的仓库Hive学习 10期-崔晓光 2016-06-20 大数据 hadoop 10原文链接我们接着之前学习的大数据来学习.之前说到了NoSql的HBase数据库以及Hadoop中 ...
Hive学习路线图(转)
Hadoophivehqlroadmap学习路线图 1 Comment Hive学习路线图 Hadoop家族系列文章,主要介绍Hadoop家族产品,常用的项目包括Hadoop, Hive, Pig ...
【转】Hive学习路线图
原文博客出自于:http://blog.fens.me/hadoop-hive-roadmap/ 感谢! Hive学习路线图 Hadoop家族系列文章,主要介绍Hadoop家族产品,常用的项目包括Ha ...
MVC 小案例 -- 信息管理
前几次更新博客都是每次周日晚上到周一,这次是周一晚上开始写,肯定也是有原因的!那就是我的 Tomact 忽然报错,无法启动,错误信息如下!同时我的 win10 也崩了,重启之后连 WIFI 的标志也不 ...

随机推荐

windows下兼容Python2和Python3
windows下同时安装了python2和python3时,都可以配置环境变量,如果在命令行里输入python命令,windows会去环境变量里寻找Python的安装位置,如果先找到pytoon2的, ...
【BZOJ3638】Cf172 k-Maximum Subsequence Sum 线段树区间合并（模拟费用流）
[BZOJ3638]Cf172 k-Maximum Subsequence Sum Description 给一列数,要求支持操作: 1.修改某个数的值 2.读入l,r,k,询问在[l,r]内选不相交 ...
使用RestTemplate post方式提交表单数据
HttpHeaders headers = new HttpHeaders(); headers.setContentType(MediaType.APPLICATION_FORM_URLENCODE ...
1119 机器人走方格 V2(组合)
1119 机器人走方格 V2 基准时间限制:1 秒空间限制:131072 KB 分值: 10 难度:2级算法题 M * N的方格,一个机器人从左上走到右下,只能向右或向下走.有多少种不同的走法?由于 ...
SharePoint服务器端对象模型之访问网站和列表数据（Part 5）
(五)列表条目(SPListItem) SharePoint中数据的存储基本上都是通过列表条目来完成(文档库中的文档也是一种特殊的列表条目),因此在SharePoint应用开发中,最终是要和列表条目打 ...
openssl之EVP系列之7---信息摘要算法结构概述
openssl之EVP系列之7---信息摘要算法结构概述 ---依据openssl doc/crypto/EVP_DigestInit.pod翻译和自己的理解写成 (作者:Dragon ...
剑指offer 面试62题
面试62题: 题目:圆圈中最后剩下的数字题:0,1,...,n-1这n个数字排成一个圆圈,从数字0开始,每次从这个圆圈里删除第m个数字.求出这个圆圈里剩下的最后一个数字. 解题思路:约瑟夫环问题,可 ...
hadoop本地运行与集群运行
开发环境: windows10+伪分布式(虚拟机组成的集群)+IDEA(不需要装插件) 介绍: 本地开发,本地debug,不需要启动集群,不需要在集群启动hdfs yarn 需要准备什么: 1/配置w ...
MVC，MVP和MVVM区别
复杂的软件必须有清晰合理的架构,否则无法开发和维护. MVC(Model-View-Controller)是最常见的软件架构之一,业界有着广泛应用.它本身很容易理解,但是要讲清楚,它与衍生的 MVP ...
微服务架构~BFF和网关是如何演化出来的
介绍 BFF(Backend for Frontend)和网关Gateway是微服务架构中的两个重要概念,这两个概念相对比较新,有些开发人员甚至是架构师都不甚理解. 本文用假想的公司案例+图示的方式, ...

hive学习8（小案例1练习）

创建数据库

创建表

数据导入

将爬虫爬取的职位表信息导入到stg_job表中

hive数据清洗（ETL）

对job_location（工作地点）, edu（学历）, work_year（工作年限）,salary（薪资范围）4列数据的空值进行转换

hive提取维度信息

hive学习8（小案例1练习）的更多相关文章

随机推荐

热门专题