Hive | ETL Cleaning & Query Practice
ETL data cleaning
Add the Maven dependencies:
<dependencies>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <!-- RELEASE resolves to the latest published version; pin a concrete version for reproducible builds -->
        <version>RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.2</version>
    </dependency>
</dependencies>
ETLUtil.java
public class ETLUtil {
    /**
     * Cleans one raw log line: drops records with fewer than 9 fields,
     * strips spaces from the category field, then re-joins the fields so the
     * first 9 are tab-separated and the related-video ids are joined by "&".
     */
    public static String etl(String original) {
        StringBuilder stringBuilder = new StringBuilder();
        String[] fields = original.split("\t");
        if (fields.length < 9) {
            return null;
        }
        // Normalize the log line: remove spaces inside the category field
        fields[3] = fields[3].replace(" ", "");
        for (int i = 0; i < fields.length; i++) {
            if (i == fields.length - 1) {
                stringBuilder.append(fields[i]);
            } else if (i < 9) {
                stringBuilder.append(fields[i]).append("\t");
            } else {
                stringBuilder.append(fields[i]).append("&");
            }
        }
        return stringBuilder.toString();
    }
}
ETLMapper.java
import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String original = value.toString();
        String etlString = ETLUtil.etl(original);
        if (StringUtils.isNotEmpty(etlString)) {
            k.set(etlString);
            context.write(k, NullWritable.get());
            context.getCounter("ETL", "True").increment(1);   // lines kept
        } else {
            context.getCounter("ETL", "False").increment(1);  // lines dropped
        }
    }
}
ETLDriver.java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ETLDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(ETLDriver.class);
        job.setMapperClass(ETLMapper.class);
        job.setNumReduceTasks(0); // map-only job, no reducers needed
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir -p /guli/user
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/video
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/etl
[kris@hadoop102 datas]$ hadoop fs -moveFromLocal user.txt /guli/user
[kris@hadoop102 datas]$ hadoop fs -moveFromLocal *.txt /guli/video
[kris@hadoop102 hadoop-2.7.2]$ hadoop jar ETLVideo.jar com.atguigu.etl.ETLDriver /guli/video /guli/video_etl
ETL
False=
True=
Create the tables:
create external table gulivideo_ori(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile
location '/guli/video_etl';

create external table gulivideo_user_ori(
    uploader string,
    videos int,
    friends int)
row format delimited
fields terminated by "\t"
stored as textfile
location '/guli/user';

create table gulivideo_orc(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;

create table gulivideo_user_orc(
    uploader string,
    videos int,
    friends int)
row format delimited
fields terminated by "\t"
stored as orc;

0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_orc select * from gulivideo_ori;
0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_user_orc select * from gulivideo_user_ori;
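A quick sanity check that the ORC tables were populated (row counts depend on your dataset):
select count(*) from gulivideo_orc;
select count(*) from gulivideo_user_orc;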
1. Top 10 most-viewed videos
select videoid, uploader, views from gulivideo_orc
order by views desc limit 10;
+--------------+------------------+-----------+--+
| videoid | uploader | views |
+--------------+------------------+-----------+--+
| dMH0bHeiRNg | judsonlaipply | 42513417 |
| 0XxI-hvPRRA | smosh | 20282464 |
| 1dmVU08zVpA | NBC | 16087899 |
| RB-wUgnyGv0 | ChrisInScotland | 15712924 |
| QjA5faZF1A8 | guitar90 | 15256922 |
| -_CSo1gOd48 | tasha | 13199833 |
| 49IDp76kjPw | TexMachina | 11970018 |
| tYnn51C3X_w | CowSayingMoo | 11823701 |
| pv5zWaTEVkI | OkGo | 11672017 |
| D2kJZOfq7zk | mrWoot | 11184051 |
+--------------+------------------+-----------+--+
10 rows selected (22.612 seconds)
Two rules for using group by:
(1) Every column in the select list must either sit inside an aggregate function or appear in group by.
(2) To filter, apply where before group by, or having after group by (a minimal sketch follows).
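A minimal illustration of both rules on gulivideo_orc (a sketch; the threshold of 10 uploads is arbitrary):
select uploader, count(videoid) video_cnt     -- uploader is grouped, count(...) is an aggregate
from gulivideo_orc
group by uploader
having count(videoid) > 10;                   -- filter on the aggregate, after grouping
-- adding a bare videoid to the select list would fail with
-- "Expression not in GROUP BY key 'videoid'"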
2. Top 10 hottest video categories (a category is hotter the more videoids it contains; rank the categories by that count. Grouping only defines the groups, it does not rank within them.)
① List each video with its categories:
select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories
② Rank categories by hotness:
select t1.videoid, t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1
group by t1.categories order by num desc limit 10;
---> Combined, but t1.videoid cannot appear in the select list (it is neither grouped nor aggregated):
select t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1
group by t1.categories order by num desc limit 10;
+----------------+---------+--+
| t1.categories | num |
+----------------+---------+--+
| Music | 179049 |
| Entertainment | 127674 |
| Comedy | 87818 |
| Animation | 73293 |
| Film | 73293 |
| Sports | 67329 |
| Gadgets | 59817 |
| Games | 59817 |
| Blogs | 48890 |
| People | 48890 |
+----------------+---------+--+
10 rows selected (70.01 seconds)
3. For the 20 most-viewed videos, report each video's categories, and how many of the Top 20 videos each category contains. // Selecting a column that is neither grouped nor aggregated raises: Expression not in GROUP BY key 'videoid'
(similarly "not in GROUP BY key 'views'" when views trails in the query: everything after select must be grouped or aggregated; a minimal reproduction follows)
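A minimal reproduction of that error and the usual fix (a sketch):
-- FAILS with "Expression not in GROUP BY key 'videoid'":
--   select categories, videoid
--   from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t
--   group by categories;
-- WORKS: aggregate the non-grouped column instead:
select categories, count(videoid) cnt
from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t
group by categories;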
############
① The 20 most-viewed videos:
select videoid, category, views from gulivideo_orc order by views desc limit 20
② Explode category to get each video's categories:
select videoid, categories, views from t1 lateral view explode(category) tbl as categories
---> Combining the two:
select t1.videoid, categories, t1.views from (select videoid, category, views from gulivideo_orc order by views desc limit 20) t1 lateral view explode(category) tbl as categories;
+--------------+----------------+-----------+--+
| t1.videoid | categories | t1.views |
+--------------+----------------+-----------+--+
| dMH0bHeiRNg | Comedy | |
| 0XxI-hvPRRA | Comedy | |
| 1dmVU08zVpA | Entertainment | |
| RB-wUgnyGv0 | Entertainment | |
| QjA5faZF1A8 | Music | |
| -_CSo1gOd48 | People | |
| -_CSo1gOd48 | Blogs | |
| 49IDp76kjPw | Comedy | |
| tYnn51C3X_w | Music | |
| pv5zWaTEVkI | Music | |
| D2kJZOfq7zk | People | |
| D2kJZOfq7zk | Blogs | |
| vr3x_RRJdd4 | Entertainment | |
| lsO6D1rwrKc | Entertainment | |
| 5P6UU6m3cqk | Comedy | |
| 8bbTtPL1jRs | Music | |
| _BuRwH59oAo | Comedy | |
| aRNzWyD7C9o | UNA | |
| UMf40daefsI | Music | |
| ixsZy2425eY | Entertainment | |
| MNxwAU_xAMk | Comedy | |
| RUCZJVJ_M8o | Entertainment | |
+--------------+----------------+-----------+--+
③ How many Top 20 videos each category contains: group the exploded rows by category and count the videoids:
--->
select categories, count(videoid) from (select videoid, category, views from gulivideo_orc order by views desc limit 20) t1 lateral view explode(category) tbl as categories group by categories
+----------------+------+--+
| categories | _c1 |
+----------------+------+--+
| Blogs | |
| Comedy | |
| Entertainment | |
| Music | |
| People | |
| UNA | |
+----------------+------+--+
limit cannot be used inside an over() clause, so how do you take the first few rows of each partition's ordering? Use a subquery. (A window "partition" is just a grouping of rows during the query; table partitions, by contrast, are sub-directories on disk, and at query time it is all still one table.)
select t1.videoid, t1.views, t1.ran, t1.categories from(
select videoid, views, categories, rank() over(partition by categories order by views desc) ran
from gulivideo_orc lateral view explode(category) tbl as categories) t1
where t1.ran <= 5;
+--------------+-----------+---------+----------------+--+
| t1.videoid | t1.views | t1.ran | t1.categories |
+--------------+-----------+---------+----------------+--+
| 2GWPOPSXGYI | | | Animals |
| xmsV9R8FsDA | | | Animals |
| 12PsUW-8ge4 | | | Animals |
| OeNggIGSKH8 | | | Animals |
| WofFb_eOxxA | | | Animals |
| sdUUx5FdySs | | | Animation |
| 6B26asyGKDo | | | Animation |
| H20dhY01Xjk | | | Animation |
| 55YYaJIrmzo | | | Animation |
| JzqumbhfxRo | | | Animation |
| RjrEQaG5jPM | | | Autos
......
4. For the Top 50 most-viewed videos, rank the categories of their related videos
Top 50 ---> relatedid ---> categories ---> rank; you cannot join a lateral view directly (it is a virtual table, which Hive does not support in a join), so wrap each step in a subquery.
Step t1, the Top 50 by views:
select videoid, views, relatedid from gulivideo_orc order by views desc limit 50
Explode relatedid in its own query, t2:
select distinct(tbl.relatedids) rid from t1 lateral view explode(relatedid) tbl as relatedids
Join it back to the table itself, t3:
select g.videoid, g.category from t2 join gulivideo_orc g on t2.rid = g.videoid
Explode category and rank:
select categories, count(videoid) hot from t3 lateral view explode(category) tbl2 as categories group by categories order by hot desc;
Assembled:
select categories, count(videoid) hot
from(select g.videoid, g.category
     from(select distinct(tbl.relatedids) rid
          from(select videoid, views, relatedid
               from gulivideo_orc order by views desc limit 50) t1 lateral view explode(relatedid) tbl as relatedids) t2
     join gulivideo_orc g on t2.rid=g.videoid) t3 lateral view explode(category) tbl2 as categories
group by categories order by hot desc;
+----------------+------+--+
| categories | hot |
+----------------+------+--+
| Comedy | |
| Entertainment | |
| Music | |
| Blogs | |
| People | |
| Film | |
| Animation | |
| News | |
| Politics | |
| Games | |
| Gadgets | |
| Sports | |
| Places | |
| UNA | |
| Travel | |
| Howto | |
| DIY | |
| Animals | |
| Pets | |
| Autos | |
| Vehicles | |
+----------------+------+--+
21 rows selected (115.239 seconds)
5. Top 10 hottest videos within each category, Music as the example
Create the category table (one row per video-category pair):
create table gulivideo_category(
videoid string, uploader string, age int, categoryid string, length int, views int, rate float,
ratings int, comments int, relatedid array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
Insert the data:
insert into table gulivideo_category
select videoid, uploader, age, categoryid, length, views, rate, ratings, comments, relatedid
from gulivideo_orc lateral view explode(category) category as categoryid;
---> Per-category Top 10 over the whole table:
select categoryid, videoid, paiming from (
    select categoryid, videoid, rank() over(partition by categoryid order by views desc) paiming from gulivideo_category) t1
where t1.paiming <= 10;
For Music only:
select categoryid, videoid, views
from gulivideo_category
where categoryid="Music"
order by views desc limit 10;

6. Top 10 videos by traffic (ratings) within each category, Music as the example
select videoid, ratings
from gulivideo_category
where categoryid="Music"
order by ratings desc limit 10;

7. The Top 10 uploaders by number of uploaded videos, and their videos ranked in the Top 20 by views
① Top 10 uploaders by videos uploaded:
select videos,uploader
from gulivideo_user_orc
order by videos desc limit 10;
② Join those 10 uploaders to their videos and rank each uploader's videos by views (t2):
select g.videoid, rank() over(partition by g.uploader order by g.views desc) hot from t1 join gulivideo_orc g on t1.uploader = g.uploader
③ Keep the top 20 per uploader:
select t2.uploader, t2.videoid from t2 where t2.hot <= 20;
select t2.uploader, t2.videoid from(
    select g.uploader, g.videoid, g.views, rank() over(partition by g.uploader order by g.views desc) hot from
    (select uploader, videos
     from gulivideo_user_orc
     order by videos desc limit 10) t1
    left join gulivideo_orc g on t1.uploader=g.uploader) t2
where t2.hot <= 20;
+----------------+--------------+--+
| t2.uploader | t2.videoid |
+----------------+--------------+--+
| NULL | NULL |
| NULL | NULL |
| NULL | NULL |
| NULL | NULL |
| Ruchaneewan | xbYyjUdhtJw |
| Ruchaneewan | 4dkKeIUkN7E |
| Ruchaneewan | qCfuQA6N4K0 |
| Ruchaneewan | TmYbGQaRcNM |
| Ruchaneewan | dOlfPsFSjw0 |
| expertvillage | -IxHBW0YpZw |
| expertvillage | BU-fT5XI_8I |
| expertvillage | ADOcaBYbMl0 |
...
(The NULL rows are top uploaders with no videos in gulivideo_orc; the left join keeps them with NULL video columns.)

8. Top 10 most-viewed videos per category
select t.categoryid, t.videoid, t.ranking
from(
    select categoryid, videoid, rank() over(partition by categoryid order by views desc) ranking
    from gulivideo_category) t
where t.ranking <= 10;
+----------------+--------------+------------+--+
| t.categoryid | t.videoid | t.ranking |
+----------------+--------------+------------+--+
| Animals | 2GWPOPSXGYI | |
| Animals | xmsV9R8FsDA | |
| Animals | 12PsUW-8ge4 | |
| Animals | OeNggIGSKH8 | |
| Animals | WofFb_eOxxA | |
| Animals | AgEmZ39EtFk | |
| Animals | a-gW3RbJd8U | |
| Animals | 8CL2hetqpfg | |
| Animals | QmroaYVD_so | |
| Animals | Sg9x5mUjbH8 | |
| Animation | sdUUx5FdySs | |
| Animation | 6B26asyGKDo | |
| Animation | H20dhY01Xjk | |
| Animation | 55YYaJIrmzo | |
| Animation | JzqumbhfxRo | |
| Animation | eAhfZUZiwSE | |
| Animation | h7svw0m-wO0 | |
| Animation | tAq3hWBlalU | |
| Animation | AJzU3NjDikY | |
| Animation | ElrldD02if0 | |
| Autos | RjrEQaG5jPM | |
......
rows selected (24.379 seconds)
1. Grouped Top N: for this year, select the top 3 scores per school, per grade, per subject.
Fields: date, school, grade, name, subject, score
Create the table:
create external table score_test(school string, grade string, name string, subject string, score int)
partitioned by (year string)
row format delimited fields terminated by ','
stored as textfile
location '/hive_data';
(note: stored as textfile must come before the location clause; placing it after raises an error)
select t1.name, t1.subject, t1.ran from(select name, subject, row_number() over(partition by school, grade, subject order by score desc) ran
from score_test where year="2013") t1 where t1.ran <= 3;
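The answer uses row_number(); rank(), dense_rank() and row_number() differ only in tie handling. A quick side-by-side on score_test (a sketch):
select name, score,
       rank()       over(order by score desc) rk,   -- ties share a rank, gaps follow: 1,1,3
       dense_rank() over(order by score desc) drk,  -- ties share a rank, no gaps:     1,1,2
       row_number() over(order by score desc) rn    -- ties numbered arbitrarily:      1,2,3
from score_test
where year = "2013";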
2. This year, at 清华, grade 1: the students whose total score exceeds 200, plus the number of such students || a group by over several columns, name included
select school, grade, name, sum(score) sum_score, count(1) over() num from score_test
where year = "2013" and school="清华" and grade="1"
group by school, grade, name having sum(score) > 200;
3.
CREATE TABLE transaction_details (cust_id INT, amount FLOAT, month STRING, country STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
Create it as a table partitioned by month (a partition column may not duplicate a regular column, so month moves out of the column list):
create table transaction_details(cust_id int, amount float, country string)
partitioned by(month string)
row format delimited fields terminated by ',';
Total income per month:
select cust_id, sum(amount) over(partition by month) as total from transaction_details;

4. Convert managed (internal) table a into an external table:
alter table a set tblproperties('EXTERNAL'='TRUE');  -- the value must be the uppercase string 'TRUE'

5. Order detail table ord_det (order_id order number, sku_id product id, sale_qtty quantity sold, dt date partition)
Task: compute the Top 100 products by sales volume on 2016-01-01, sorted by volume descending (aggregate per sku_id, since the ranking is per product):
select sku_id, sum(sale_qtty) total_qtty from ord_det
where dt = "20160101" group by sku_id order by total_qtty desc limit 100;
Table STG.ORDER has the fields Date, Order_id, User_id, Amount. Write SQL for the statistics below.
Sample row: 2017-01-01,10029028,1000003251,33.57
1) For each month of 2017: number of orders, number of users, and total transaction amount.
A partition can hold a lot of data; instead of count(distinct ...), a group by user_id subquery followed by count() scales better:
select count(user_id) from (select user_id from stg.order group by user_id) t;
select substring(date, 1, 7) month,
       count(order_id) order_count,
       count(distinct user_id) user_count,
       sum(amount) total
from stg.order
where substring(date, 1, 4) = '2017'
group by substring(date, 1, 7);
2) The number of new customers in November 2017 (users whose first order ever falls in November).
select count(1) from
(select user_id, date, lag(date, 1) over(partition by user_id order by date) firstOrder from stg.order) t1
where substring(date, 1, 7) = '2017-11' and firstOrder is null;
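An equivalent way to count November's new customers that may be easier to reason about: take each user's first order date with min() (a sketch against the same STG.ORDER layout):
select count(*) new_users
from (
    select user_id, min(date) first_dt
    from stg.order
    group by user_id
) t
where substring(first_dt, 1, 7) = '2017-11';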
Ant Forest plant redemption statistics
===================================================================
Table 1: user_low_carbon records each user's daily Ant Forest low-carbon energy credits.
user_id(int) data_dt(string) low_carbon
user id | date | carbon reduced (g). Sample data:
user_id | data_dt | low_carbon
————————————————————
u_001 | 2017/1/1 | 10
u_001 | 2017/1/2 | 150
u_001 | 2017/1/2 | 110
u_001 | 2017/1/2 | 10
u_001 | 2017/1/4 | 50
u_001 | 2017/1/4 | 10
u_001 | 2017/1/6 | 45
u_001 | 2017/1/6 | 90
u_002 | 2017/1/1 | 10
u_002 | 2017/1/2 | 150
u_002 | 2017/1/2 | 70
u_002 | 2017/1/3 | 30
u_002 | 2017/1/3 | 80
u_002 | 2017/1/4 | 150
u_002 | 2017/1/5 | 101
u_002 | 2017/1/6 | 68
================================================================
Table 2: plant_carbon records the carbon reduction required to redeem each plant. plant_id(int) plant_name plant_carbon
plant id | plant name | carbon needed to redeem the plant. Sample data:
plant_id | plant_name | plant_carbon
————————————————————
p001 | 梭梭树 | 17
p002 | 沙柳 | 19
p003 | 樟子树 | 146
p004 | 胡杨 | 215
================================================================
Question 1
Ant Forest plant redemption statistics
Problem: assume low-carbon records (user_low_carbon) begin on 2017-01-01, and that every user who qualified before 2017-10-01 redeemed one "p004 - 胡杨" first,
then spent all remaining energy on "p002 - 沙柳".
As of October 1, report the Top 10 users by number of 沙柳 redeemed, and how many more 沙柳 each redeemed than the next-ranked user.
The expected result shape:
user_id plant_count less_count (how many more 沙柳 than the next user)
u_101 1000 100
u_088 900 400
u_103 500 …

1. Top 10 users by accumulated energy: records dated before October 1 | group by user id | order by total energy | take the first 11 (one extra so the 10th user still has a next user to compare with):
t1
select user_id, sum(low_carbon) sum_carbon from user_low_carbon
where datediff(regexp_replace("2017/10/1", "/", "-"), regexp_replace(data_dt, "/", "-")) > 0
group by user_id order by sum_carbon desc limit 11;
2. Energy cost of one 胡杨:
t2
select plant_carbon huyang from plant_carbon where plant_name = "胡杨";
3. Energy cost of one 沙柳:
t3
select plant_carbon shaliu from plant_carbon where plant_name = "沙柳";
4. The number of 沙柳 each user can redeem:
t4
select user_id, floor((sum_carbon - huyang) / shaliu) plant_count from t1, t2, t3;
5. The margin over the next-ranked user: lead() pulls the value n rows ahead onto the current row, so both values sit on one row for comparison:
t5
select user_id, plant_count, lead(plant_count, 1, 0) over(order by plant_count desc) plant_count2 from t4;
6. Compare:
select user_id, plant_count, (plant_count - plant_count2) less_count from t5 limit 10;
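Assembling steps 1-6 into one statement (a sketch against the schemas above; cross join replaces the comma-separated from list to attach the two single-row plant costs):
select user_id, plant_count,
       plant_count - lead(plant_count, 1, 0) over(order by plant_count desc) less_count
from (
    select t1.user_id,
           floor((t1.sum_carbon - t2.huyang) / t3.shaliu) plant_count
    from (select user_id, sum(low_carbon) sum_carbon
          from user_low_carbon
          where datediff(regexp_replace("2017/10/1", "/", "-"), regexp_replace(data_dt, "/", "-")) > 0
          group by user_id
          order by sum_carbon desc
          limit 11) t1
    cross join (select plant_carbon huyang from plant_carbon where plant_name = "胡杨") t2
    cross join (select plant_carbon shaliu from plant_carbon where plant_name = "沙柳") t3
) t4
order by plant_count desc
limit 10;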
=================================================================
Question 2
Ant Forest low-carbon user ranking analysis
Problem: return the daily flow records from user_low_carbon for users who,
in 2017, on three or more consecutive days,
reduced carbon (low_carbon) by more than 100g per day.
Return the matching rows of user_low_carbon.
For example, user u_002 qualifies with the records below, because on each of the four consecutive days 2017/1/2 to 2017/1/5 the daily total is at least 100g:
seq(key) user_id data_dt low_carbon
xxxxx10 u_002 2017/1/2 150
xxxxx11 u_002 2017/1/2 70
xxxxx12 u_002 2017/1/3 30
xxxxx13 u_002 2017/1/3 80
xxxxx14 u_002 2017/1/4 150
xxxxx14 u_002 2017/1/5 101

1. Filter to the user-days in 2017 whose daily total exceeds 100g;
group by user id and date (the same date may span several rows), keeping days where the daily total > 100:
t1
select user_id, data_dt, sum(low_carbon) sum_day
from user_low_carbon
where substring(data_dt, 1, 4) = "2017"
group by user_id, data_dt
having sum(low_carbon) > 100
order by user_id, data_dt
2. For each row, the dates of the two rows before and the two rows after:
t2
select user_id,
data_dt,
lag(data_dt, 2, "2000/1/1") over(partition by user_id order by data_dt) lag_2,
lag(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt) lag_1,
lead(data_dt, 2, "2000/1/1") over(partition by user_id order by data_dt) lead_2,
lead(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt) lead_1
from t1;
3. Date difference between the current row and the two rows before/after:
t3:
select user_id, data_dt,
       datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lag_2, "/", "-")) lag2,
       datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lag_1, "/", "-")) lag1,
       datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lead_2, "/", "-")) lead2,
       datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lead_1, "/", "-")) lead1
from t2;
4. Three consecutive days can appear in three patterns:
① the gap to the previous day is 1 and to the day before that is 2 (the current day ends the run);
② the gap to the previous day is 1 and to the next day is -1 (the current day is in the middle);
③ the gap to the next day is -1 and to the day after that is -2 (the current day starts the run);
t4:
select user_id, data_dt from t3
where (lag1=1 and lag2=2) or (lag1=1 and lead1=-1) or (lead1=-1 and lead2=-2);
5. Final result: join the qualifying (user_id, data_dt) pairs back to the flow table:
select t5.user_id, t5.data_dt, t5.low_carbon from user_low_carbon t5
inner join t4 on t4.user_id = t5.user_id and t4.data_dt = t5.data_dt;
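Assembled into one statement (a sketch against the schemas above; the "2000/1/1" defaults are sentinels that can never yield a gap of 1, 2, -1 or -2):
select flow.user_id, flow.data_dt, flow.low_carbon
from user_low_carbon flow
join (
    -- t4: days inside a run of 3+ consecutive qualifying days
    select user_id, data_dt
    from (
        -- t3: gaps (in days) to the two previous / two following qualifying days
        select user_id, data_dt,
               datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lag_1, "/", "-")) lag1,
               datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lag_2, "/", "-")) lag2,
               datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lead_1, "/", "-")) lead1,
               datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lead_2, "/", "-")) lead2
        from (
            -- t2: neighbouring qualifying dates per user
            select user_id, data_dt,
                   lag(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt) lag_1,
                   lag(data_dt, 2, "2000/1/1") over(partition by user_id order by data_dt) lag_2,
                   lead(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt) lead_1,
                   lead(data_dt, 2, "2000/1/1") over(partition by user_id order by data_dt) lead_2
            from (
                -- t1: user-days in 2017 totalling more than 100g
                select user_id, data_dt
                from user_low_carbon
                where substring(data_dt, 1, 4) = "2017"
                group by user_id, data_dt
                having sum(low_carbon) > 100
            ) t1
        ) t2
    ) t3
    where (lag1 = 1 and lag2 = 2) or (lag1 = 1 and lead1 = -1) or (lead1 = -1 and lead2 = -2)
) t4
on flow.user_id = t4.user_id and flow.data_dt = t4.data_dt;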
======================================================
Note:
Hive functions used above
====================================
1: regexp_replace(arg1, arg2, arg3)
arg1: the source string
arg2: the regex pattern to be replaced
arg3: the replacement string
e.g.: regexp_replace("2017/1/4", "/", "-") = 2017-1-4
=====================================
2: datediff(arg1, arg2)
arg1: end date
arg2: start date (returns arg1 minus arg2, in days)
e.g.: datediff("2017-1-6", "2017-1-5") = 1
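The window functions used above follow the same pattern; both are standard Hive built-ins:
=====================================
3: lag(col, n, default) / lead(col, n, default) over(partition by ... order by ...)
the value of col n rows before / after the current row within its window; default is returned when no such row exists
e.g.: lag(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt)
=====================================
4: rank() over(partition by ... order by ...)
the rank of the current row within its window; ties share a rank and leave gaps (dense_rank() leaves no gaps, row_number() numbers rows 1,2,3 regardless of ties)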