Hive | ETL Cleaning & Query Practice
ETL data cleaning
Add the Maven dependencies:
<dependencies>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <!-- RELEASE resolves to the latest published version; pin a concrete version for reproducible builds -->
        <version>RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.2</version>
    </dependency>
</dependencies>
ETLUtil.java
public class ETLUtil {
    /**
     * Cleans one raw log line: drops records with fewer than 9 fields,
     * strips spaces from the category field, then re-joins the fields so the
     * first 9 are tab-separated and the related-video ids are joined by "&".
     */
    public static String etl(String original) {
        StringBuilder stringBuilder = new StringBuilder();
        String[] fields = original.split("\t");
        if (fields.length < 9) {
            return null;
        }
        // Normalize the log line: remove spaces inside the category field
        fields[3] = fields[3].replace(" ", "");
        for (int i = 0; i < fields.length; i++) {
            if (i == fields.length - 1) {
                stringBuilder.append(fields[i]);
            } else if (i < 9) {
                stringBuilder.append(fields[i]).append("\t");
            } else {
                stringBuilder.append(fields[i]).append("&");
            }
        }
        return stringBuilder.toString();
    }
}
ETLMapper.java
import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String original = value.toString();
        String etlString = ETLUtil.etl(original);
        if (StringUtils.isNotEmpty(etlString)) {
            k.set(etlString);
            context.write(k, NullWritable.get());
            context.getCounter("ETL", "True").increment(1);   // lines kept
        } else {
            context.getCounter("ETL", "False").increment(1);  // lines dropped
        }
    }
}
ETLDriver.java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ETLDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(ETLDriver.class);
        job.setMapperClass(ETLMapper.class);
        job.setNumReduceTasks(0); // map-only job, no reducers needed
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir -p /guli/user
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/video
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/etl
[kris@hadoop102 datas]$ hadoop fs -moveFromLocal user.txt /guli/user
[kris@hadoop102 datas]$ hadoop fs -moveFromLocal *.txt /guli/video
[kris@hadoop102 hadoop-2.7.2]$ hadoop jar ETLVideo.jar com.atguigu.etl.ETLDriver /guli/video /guli/video_etl
ETL
False=
True=
Create the tables:
create external table gulivideo_ori(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile
location '/guli/video_etl';

create external table gulivideo_user_ori(
    uploader string,
    videos int,
    friends int)
row format delimited
fields terminated by "\t"
stored as textfile
location '/guli/user';

create table gulivideo_orc(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;

create table gulivideo_user_orc(
    uploader string,
    videos int,
    friends int)
row format delimited
fields terminated by "\t"
stored as orc;

0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_orc select * from gulivideo_ori;
0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_user_orc select * from gulivideo_user_ori;
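A quick sanity check that the ORC tables were populated (row counts depend on your dataset):
select count(*) from gulivideo_orc;
select count(*) from gulivideo_user_orc;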
1. Top 10 most-viewed videos
select videoid, uploader, views from gulivideo_orc
order by views desc limit 10;
+--------------+------------------+-----------+--+
| videoid | uploader | views |
+--------------+------------------+-----------+--+
| dMH0bHeiRNg | judsonlaipply | 42513417 |
| 0XxI-hvPRRA | smosh | 20282464 |
| 1dmVU08zVpA | NBC | 16087899 |
| RB-wUgnyGv0 | ChrisInScotland | 15712924 |
| QjA5faZF1A8 | guitar90 | 15256922 |
| -_CSo1gOd48 | tasha | 13199833 |
| 49IDp76kjPw | TexMachina | 11970018 |
| tYnn51C3X_w | CowSayingMoo | 11823701 |
| pv5zWaTEVkI | OkGo | 11672017 |
| D2kJZOfq7zk | mrWoot | 11184051 |
+--------------+------------------+-----------+--+
10 rows selected (22.612 seconds)
Two rules for using group by:
(1) Every column in the select list must either sit inside an aggregate function or appear in group by.
(2) To filter, apply where before group by, or having after group by (a minimal sketch follows).
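A minimal illustration of both rules on gulivideo_orc (a sketch; the threshold of 10 uploads is arbitrary):
select uploader, count(videoid) video_cnt     -- uploader is grouped, count(...) is an aggregate
from gulivideo_orc
group by uploader
having count(videoid) > 10;                   -- filter on the aggregate, after grouping
-- adding a bare videoid to the select list would fail with
-- "Expression not in GROUP BY key 'videoid'"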
2. Top 10 hottest video categories (a category is hotter the more videoids it contains; rank the categories by that count. Grouping only defines the groups, it does not rank within them.)
① List each video with its categories:
select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories
② Rank categories by hotness:
select t1.videoid, t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1
group by t1.categories order by num desc limit 10;
---> Combined, but t1.videoid cannot appear in the select list (it is neither grouped nor aggregated):
select t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1
group by t1.categories order by num desc limit 10;
+----------------+---------+--+
| t1.categories | num |
+----------------+---------+--+
| Music | 179049 |
| Entertainment | 127674 |
| Comedy | 87818 |
| Animation | 73293 |
| Film | 73293 |
| Sports | 67329 |
| Gadgets | 59817 |
| Games | 59817 |
| Blogs | 48890 |
| People | 48890 |
+----------------+---------+--+
10 rows selected (70.01 seconds)
3. For the 20 most-viewed videos, report each video's categories, and how many of the Top 20 videos each category contains. // Selecting a column that is neither grouped nor aggregated raises: Expression not in GROUP BY key 'videoid'
(similarly "not in GROUP BY key 'views'" when views trails in the query: everything after select must be grouped or aggregated; a minimal reproduction follows)
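A minimal reproduction of that error and the usual fix (a sketch):
-- FAILS with "Expression not in GROUP BY key 'videoid'":
--   select categories, videoid
--   from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t
--   group by categories;
-- WORKS: aggregate the non-grouped column instead:
select categories, count(videoid) cnt
from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t
group by categories;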
############
① The 20 most-viewed videos:
select videoid, category, views from gulivideo_orc order by views desc limit 20
② Explode category to get each video's categories:
select videoid, categories, views from t1 lateral view explode(category) tbl as categories
---> Combining the two:
select t1.videoid, categories, t1.views from (select videoid, category, views from gulivideo_orc order by views desc limit 20) t1 lateral view explode(category) tbl as categories;
+--------------+----------------+-----------+--+
| t1.videoid | categories | t1.views |
+--------------+----------------+-----------+--+
| dMH0bHeiRNg | Comedy | |
| 0XxI-hvPRRA | Comedy | |
| 1dmVU08zVpA | Entertainment | |
| RB-wUgnyGv0 | Entertainment | |
| QjA5faZF1A8 | Music | |
| -_CSo1gOd48 | People | |
| -_CSo1gOd48 | Blogs | |
| 49IDp76kjPw | Comedy | |
| tYnn51C3X_w | Music | |
| pv5zWaTEVkI | Music | |
| D2kJZOfq7zk | People | |
| D2kJZOfq7zk | Blogs | |
| vr3x_RRJdd4 | Entertainment | |
| lsO6D1rwrKc | Entertainment | |
| 5P6UU6m3cqk | Comedy | |
| 8bbTtPL1jRs | Music | |
| _BuRwH59oAo | Comedy | |
| aRNzWyD7C9o | UNA | |
| UMf40daefsI | Music | |
| ixsZy2425eY | Entertainment | |
| MNxwAU_xAMk | Comedy | |
| RUCZJVJ_M8o | Entertainment | |
+--------------+----------------+-----------+--+
③ How many Top 20 videos each category contains: group the exploded rows by category and count the videoids:
--->
select categories, count(videoid) from (select videoid, category, views from gulivideo_orc order by views desc limit 20) t1 lateral view explode(category) tbl as categories group by categories
+----------------+------+--+
| categories | _c1 |
+----------------+------+--+
| Blogs | |
| Comedy | |
| Entertainment | |
| Music | |
| People | |
| UNA | |
+----------------+------+--+
limit cannot be used inside an over() clause, so how do you take the first few rows of each partition's ordering? Use a subquery. (A window "partition" is just a grouping of rows during the query; table partitions, by contrast, are sub-directories on disk, and at query time it is all still one table.)
select t1.videoid, t1.views, t1.ran, t1.categories from(
select videoid, views, categories, rank() over(partition by categories order by views desc) ran
from gulivideo_orc lateral view explode(category) tbl as categories) t1
where t1.ran <= 5;
+--------------+-----------+---------+----------------+--+
| t1.videoid | t1.views | t1.ran | t1.categories |
+--------------+-----------+---------+----------------+--+
| 2GWPOPSXGYI | | | Animals |
| xmsV9R8FsDA | | | Animals |
| 12PsUW-8ge4 | | | Animals |
| OeNggIGSKH8 | | | Animals |
| WofFb_eOxxA | | | Animals |
| sdUUx5FdySs | | | Animation |
| 6B26asyGKDo | | | Animation |
| H20dhY01Xjk | | | Animation |
| 55YYaJIrmzo | | | Animation |
| JzqumbhfxRo | | | Animation |
| RjrEQaG5jPM | | | Autos
......
4. For the Top 50 most-viewed videos, rank the categories of their related videos
Top 50 ---> relatedid ---> categories ---> rank; you cannot join a lateral view directly (it is a virtual table, which Hive does not support in a join), so wrap each step in a subquery.
Step t1, the Top 50 by views:
select videoid, views, relatedid from gulivideo_orc order by views desc limit 50
Explode relatedid in its own query, t2:
select distinct(tbl.relatedids) rid from t1 lateral view explode(relatedid) tbl as relatedids
Join it back to the table itself, t3:
select g.videoid, g.category from t2 join gulivideo_orc g on t2.rid = g.videoid
Explode category and rank:
select categories, count(videoid) hot from t3 lateral view explode(category) tbl2 as categories group by categories order by hot desc;
Assembled:
select categories, count(videoid) hot
from(select g.videoid, g.category
     from(select distinct(tbl.relatedids) rid
          from(select videoid, views, relatedid
               from gulivideo_orc order by views desc limit 50) t1 lateral view explode(relatedid) tbl as relatedids) t2
     join gulivideo_orc g on t2.rid=g.videoid) t3 lateral view explode(category) tbl2 as categories
group by categories order by hot desc;
+----------------+------+--+
| categories | hot |
+----------------+------+--+
| Comedy | |
| Entertainment | |
| Music | |
| Blogs | |
| People | |
| Film | |
| Animation | |
| News | |
| Politics | |
| Games | |
| Gadgets | |
| Sports | |
| Places | |
| UNA | |
| Travel | |
| Howto | |
| DIY | |
| Animals | |
| Pets | |
| Autos | |
| Vehicles | |
+----------------+------+--+
21 rows selected (115.239 seconds)
5. Top 10 hottest videos within each category, Music as the example
Create the category table (one row per video-category pair):
create table gulivideo_category(
videoid string, uploader string, age int, categoryid string, length int, views int, rate float,
ratings int, comments int, relatedid array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
Insert the data:
insert into table gulivideo_category
select videoid, uploader, age, categoryid, length, views, rate, ratings, comments, relatedid
from gulivideo_orc lateral view explode(category) category as categoryid;
---> Per-category Top 10 over the whole table:
select categoryid, videoid, paiming from (
    select categoryid, videoid, rank() over(partition by categoryid order by views desc) paiming from gulivideo_category) t1
where t1.paiming <= 10;
For Music only:
select categoryid, videoid, views
from gulivideo_category
where categoryid="Music"
order by views desc limit 10;

6. Top 10 videos by traffic (ratings) within each category, Music as the example
select videoid, ratings
from gulivideo_category
where categoryid="Music"
order by ratings desc limit 10;

7. The Top 10 uploaders by number of uploaded videos, and their videos ranked in the Top 20 by views
① Top 10 uploaders by videos uploaded:
select videos,uploader
from gulivideo_user_orc
order by videos desc limit 10;
② Join those 10 uploaders to their videos and rank each uploader's videos by views (t2):
select g.videoid, rank() over(partition by g.uploader order by g.views desc) hot from t1 join gulivideo_orc g on t1.uploader = g.uploader
③ Keep the top 20 per uploader:
select t2.uploader, t2.videoid from t2 where t2.hot <= 20;
select t2.uploader, t2.videoid from(
    select g.uploader, g.videoid, g.views, rank() over(partition by g.uploader order by g.views desc) hot from
    (select uploader, videos
     from gulivideo_user_orc
     order by videos desc limit 10) t1
    left join gulivideo_orc g on t1.uploader=g.uploader) t2
where t2.hot <= 20;
+----------------+--------------+--+
| t2.uploader | t2.videoid |
+----------------+--------------+--+
| NULL | NULL |
| NULL | NULL |
| NULL | NULL |
| NULL | NULL |
| Ruchaneewan | xbYyjUdhtJw |
| Ruchaneewan | 4dkKeIUkN7E |
| Ruchaneewan | qCfuQA6N4K0 |
| Ruchaneewan | TmYbGQaRcNM |
| Ruchaneewan | dOlfPsFSjw0 |
| expertvillage | -IxHBW0YpZw |
| expertvillage | BU-fT5XI_8I |
| expertvillage | ADOcaBYbMl0 |
...
(The NULL rows are top uploaders with no videos in gulivideo_orc; the left join keeps them with NULL video columns.)

8. Top 10 most-viewed videos per category
select t.categoryid, t.videoid, t.ranking
from(
    select categoryid, videoid, rank() over(partition by categoryid order by views desc) ranking
    from gulivideo_category) t
where t.ranking <= 10;
+----------------+--------------+------------+--+
| t.categoryid | t.videoid | t.ranking |
+----------------+--------------+------------+--+
| Animals | 2GWPOPSXGYI | |
| Animals | xmsV9R8FsDA | |
| Animals | 12PsUW-8ge4 | |
| Animals | OeNggIGSKH8 | |
| Animals | WofFb_eOxxA | |
| Animals | AgEmZ39EtFk | |
| Animals | a-gW3RbJd8U | |
| Animals | 8CL2hetqpfg | |
| Animals | QmroaYVD_so | |
| Animals | Sg9x5mUjbH8 | |
| Animation | sdUUx5FdySs | |
| Animation | 6B26asyGKDo | |
| Animation | H20dhY01Xjk | |
| Animation | 55YYaJIrmzo | |
| Animation | JzqumbhfxRo | |
| Animation | eAhfZUZiwSE | |
| Animation | h7svw0m-wO0 | |
| Animation | tAq3hWBlalU | |
| Animation | AJzU3NjDikY | |
| Animation | ElrldD02if0 | |
| Autos | RjrEQaG5jPM | |
......
rows selected (24.379 seconds)
1. Grouped Top N: for this year, select the top 3 scores per school, per grade, per subject.
Fields: date, school, grade, name, subject, score
Create the table:
create external table score_test(school string, grade string, name string, subject string, score int)
partitioned by (year string)
row format delimited fields terminated by ','
stored as textfile
location '/hive_data';
(note: stored as textfile must come before the location clause; placing it after raises an error)
select t1.name, t1.subject, t1.ran from(select name, subject, row_number() over(partition by school, grade, subject order by score desc) ran
from score_test where year="2013") t1 where t1.ran <= 3;
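The answer uses row_number(); rank(), dense_rank() and row_number() differ only in tie handling. A quick side-by-side on score_test (a sketch):
select name, score,
       rank()       over(order by score desc) rk,   -- ties share a rank, gaps follow: 1,1,3
       dense_rank() over(order by score desc) drk,  -- ties share a rank, no gaps:     1,1,2
       row_number() over(order by score desc) rn    -- ties numbered arbitrarily:      1,2,3
from score_test
where year = "2013";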
2. This year, at 清华, grade 1: the students whose total score exceeds 200, plus the number of such students || a group by over several columns, name included
select school, grade, name, sum(score) sum_score, count(1) over() num from score_test
where year = "2013" and school="清华" and grade="1"
group by school, grade, name having sum(score) > 200;
3.
CREATE TABLE transaction_details (cust_id INT, amount FLOAT, month STRING, country STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
Create it as a table partitioned by month (a partition column may not duplicate a regular column, so month moves out of the column list):
create table transaction_details(cust_id int, amount float, country string)
partitioned by(month string)
row format delimited fields terminated by ',';
Total income per month:
select cust_id, sum(amount) over(partition by month) as total from transaction_details;

4. Convert managed (internal) table a into an external table:
alter table a set tblproperties('EXTERNAL'='TRUE');  -- the value must be the uppercase string 'TRUE'

5. Order detail table ord_det (order_id order number, sku_id product id, sale_qtty quantity sold, dt date partition)
Task: compute the Top 100 products by sales volume on 2016-01-01, sorted by volume descending (aggregate per sku_id, since the ranking is per product):
select sku_id, sum(sale_qtty) total_qtty from ord_det
where dt = "20160101" group by sku_id order by total_qtty desc limit 100;
Table STG.ORDER has the fields Date, Order_id, User_id, Amount. Write SQL for the statistics below.
Sample row: 2017-01-01,10029028,1000003251,33.57
1) For each month of 2017: number of orders, number of users, and total transaction amount.
A partition can hold a lot of data; instead of count(distinct ...), a group by user_id subquery followed by count() scales better:
select count(user_id) from (select user_id from stg.order group by user_id) t;
select substring(date, 1, 7) month,
       count(order_id) order_count,
       count(distinct user_id) user_count,
       sum(amount) total
from stg.order
where substring(date, 1, 4) = '2017'
group by substring(date, 1, 7);
2) The number of new customers in November 2017 (users whose first order ever falls in November).
select count(1) from
(select user_id, date, lag(date, 1) over(partition by user_id order by date) firstOrder from stg.order) t1
where substring(date, 1, 7) = '2017-11' and firstOrder is null;
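An equivalent way to count November's new customers that may be easier to reason about: take each user's first order date with min() (a sketch against the same STG.ORDER layout):
select count(*) new_users
from (
    select user_id, min(date) first_dt
    from stg.order
    group by user_id
) t
where substring(first_dt, 1, 7) = '2017-11';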
Ant Forest plant redemption statistics
===================================================================
Table 1: user_low_carbon records each user's daily Ant Forest low-carbon energy credits.
user_id(int) data_dt(string) low_carbon
user id | date | carbon reduced (g). Sample data:
user_id | data_dt | low_carbon
————————————————————
u_001 | 2017/1/1 | 10
u_001 | 2017/1/2 | 150
u_001 | 2017/1/2 | 110
u_001 | 2017/1/2 | 10
u_001 | 2017/1/4 | 50
u_001 | 2017/1/4 | 10
u_001 | 2017/1/6 | 45
u_001 | 2017/1/6 | 90
u_002 | 2017/1/1 | 10
u_002 | 2017/1/2 | 150
u_002 | 2017/1/2 | 70
u_002 | 2017/1/3 | 30
u_002 | 2017/1/3 | 80
u_002 | 2017/1/4 | 150
u_002 | 2017/1/5 | 101
u_002 | 2017/1/6 | 68
================================================================
Table 2: plant_carbon records the carbon reduction required to redeem each plant. plant_id(int) plant_name plant_carbon
plant id | plant name | carbon needed to redeem the plant. Sample data:
plant_id | plant_name | plant_carbon
————————————————————
p001 | 梭梭树 | 17
p002 | 沙柳 | 19
p003 | 樟子树 | 146
p004 | 胡杨 | 215
================================================================
Question 1
Ant Forest plant redemption statistics
Problem: assume low-carbon records (user_low_carbon) begin on 2017-01-01, and that every user who qualified before 2017-10-01 redeemed one "p004 - 胡杨" first,
then spent all remaining energy on "p002 - 沙柳".
As of October 1, report the Top 10 users by number of 沙柳 redeemed, and how many more 沙柳 each redeemed than the next-ranked user.
The expected result shape:
user_id plant_count less_count (how many more 沙柳 than the next user)
u_101 1000 100
u_088 900 400
u_103 500 …

1. Top 10 users by accumulated energy: records dated before October 1 | group by user id | order by total energy | take the first 11 (one extra so the 10th user still has a next user to compare with):
t1
select user_id, sum(low_carbon) sum_carbon from user_low_carbon
where datediff(regexp_replace("2017/10/1", "/", "-"), regexp_replace(data_dt, "/", "-")) > 0
group by user_id order by sum_carbon desc limit 11;
2. Energy cost of one 胡杨:
t2
select plant_carbon huyang from plant_carbon where plant_name = "胡杨";
3. Energy cost of one 沙柳:
t3
select plant_carbon shaliu from plant_carbon where plant_name = "沙柳";
4. The number of 沙柳 each user can redeem:
t4
select user_id, floor((sum_carbon - huyang) / shaliu) plant_count from t1, t2, t3;
5. The margin over the next-ranked user: lead() pulls the value n rows ahead onto the current row, so both values sit on one row for comparison:
t5
select user_id, plant_count, lead(plant_count, 1, 0) over(order by plant_count desc) plant_count2 from t4;
6. Compare:
select user_id, plant_count, (plant_count - plant_count2) less_count from t5 limit 10;
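Assembling steps 1-6 into one statement (a sketch against the schemas above; cross join replaces the comma-separated from list to attach the two single-row plant costs):
select user_id, plant_count,
       plant_count - lead(plant_count, 1, 0) over(order by plant_count desc) less_count
from (
    select t1.user_id,
           floor((t1.sum_carbon - t2.huyang) / t3.shaliu) plant_count
    from (select user_id, sum(low_carbon) sum_carbon
          from user_low_carbon
          where datediff(regexp_replace("2017/10/1", "/", "-"), regexp_replace(data_dt, "/", "-")) > 0
          group by user_id
          order by sum_carbon desc
          limit 11) t1
    cross join (select plant_carbon huyang from plant_carbon where plant_name = "胡杨") t2
    cross join (select plant_carbon shaliu from plant_carbon where plant_name = "沙柳") t3
) t4
order by plant_count desc
limit 10;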
=================================================================
Question 2
Ant Forest low-carbon user ranking analysis
Problem: return the daily flow records from user_low_carbon for users who,
in 2017, on three or more consecutive days,
reduced carbon (low_carbon) by more than 100g per day.
Return the matching rows of user_low_carbon.
For example, user u_002 qualifies with the records below, because on each of the four consecutive days 2017/1/2 to 2017/1/5 the daily total is at least 100g:
seq(key) user_id data_dt low_carbon
xxxxx10 u_002 2017/1/2 150
xxxxx11 u_002 2017/1/2 70
xxxxx12 u_002 2017/1/3 30
xxxxx13 u_002 2017/1/3 80
xxxxx14 u_002 2017/1/4 150
xxxxx14 u_002 2017/1/5 101

1. Filter to the user-days in 2017 whose daily total exceeds 100g;
group by user id and date (the same date may span several rows), keeping days where the daily total > 100:
t1
select user_id, data_dt, sum(low_carbon) sum_day
from user_low_carbon
where substring(data_dt, 1, 4) = "2017"
group by user_id, data_dt
having sum(low_carbon) > 100
order by user_id, data_dt
2. For each row, the dates of the two rows before and the two rows after:
t2
select user_id,
data_dt,
lag(data_dt, 2, "2000/1/1") over(partition by user_id order by data_dt) lag_2,
lag(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt) lag_1,
lead(data_dt, 2, "2000/1/1") over(partition by user_id order by data_dt) lead_2,
lead(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt) lead_1
from t1;
3. Date difference between the current row and the two rows before/after:
t3:
select user_id, data_dt,
       datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lag_2, "/", "-")) lag2,
       datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lag_1, "/", "-")) lag1,
       datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lead_2, "/", "-")) lead2,
       datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lead_1, "/", "-")) lead1
from t2;
4. Three consecutive days can appear in three patterns:
① the gap to the previous day is 1 and to the day before that is 2 (the current day ends the run);
② the gap to the previous day is 1 and to the next day is -1 (the current day is in the middle);
③ the gap to the next day is -1 and to the day after that is -2 (the current day starts the run);
t4:
select user_id, data_dt from t3
where (lag1=1 and lag2=2) or (lag1=1 and lead1=-1) or (lead1=-1 and lead2=-2);
5. Final result: join the qualifying (user_id, data_dt) pairs back to the flow table:
select t5.user_id, t5.data_dt, t5.low_carbon from user_low_carbon t5
inner join t4 on t4.user_id = t5.user_id and t4.data_dt = t5.data_dt;
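Assembled into one statement (a sketch against the schemas above; the "2000/1/1" defaults are sentinels that can never yield a gap of 1, 2, -1 or -2):
select flow.user_id, flow.data_dt, flow.low_carbon
from user_low_carbon flow
join (
    -- t4: days inside a run of 3+ consecutive qualifying days
    select user_id, data_dt
    from (
        -- t3: gaps (in days) to the two previous / two following qualifying days
        select user_id, data_dt,
               datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lag_1, "/", "-")) lag1,
               datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lag_2, "/", "-")) lag2,
               datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lead_1, "/", "-")) lead1,
               datediff(regexp_replace(data_dt, "/", "-"), regexp_replace(lead_2, "/", "-")) lead2
        from (
            -- t2: neighbouring qualifying dates per user
            select user_id, data_dt,
                   lag(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt) lag_1,
                   lag(data_dt, 2, "2000/1/1") over(partition by user_id order by data_dt) lag_2,
                   lead(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt) lead_1,
                   lead(data_dt, 2, "2000/1/1") over(partition by user_id order by data_dt) lead_2
            from (
                -- t1: user-days in 2017 totalling more than 100g
                select user_id, data_dt
                from user_low_carbon
                where substring(data_dt, 1, 4) = "2017"
                group by user_id, data_dt
                having sum(low_carbon) > 100
            ) t1
        ) t2
    ) t3
    where (lag1 = 1 and lag2 = 2) or (lag1 = 1 and lead1 = -1) or (lead1 = -1 and lead2 = -2)
) t4
on flow.user_id = t4.user_id and flow.data_dt = t4.data_dt;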
======================================================
Note:
Hive functions used above
====================================
1: regexp_replace(arg1, arg2, arg3)
arg1: the source string
arg2: the regex pattern to be replaced
arg3: the replacement string
e.g.: regexp_replace("2017/1/4", "/", "-") = 2017-1-4
=====================================
2: datediff(arg1, arg2)
arg1: end date
arg2: start date (returns arg1 minus arg2, in days)
e.g.: datediff("2017-1-6", "2017-1-5") = 1
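The window functions used above follow the same pattern; both are standard Hive built-ins:
=====================================
3: lag(col, n, default) / lead(col, n, default) over(partition by ... order by ...)
the value of col n rows before / after the current row within its window; default is returned when no such row exists
e.g.: lag(data_dt, 1, "2000/1/1") over(partition by user_id order by data_dt)
=====================================
4: rank() over(partition by ... order by ...)
the rank of the current row within its window; ties share a rank and leave gaps (dense_rank() leaves no gaps, row_number() numbers rows 1,2,3 regardless of ties)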