hive分区分桶

1、分区
- 1.1、静态分区
  - 1.1.1、一个分区
  - 1.1.2、多个分区
- 1.2、动态分区
2、分桶

1、分区

如果一个表中数据很多，我们查询时就很慢，耗费大量时间，如果要查询其中部分数据该怎么办呢，这时我们引入分区的概念。

Hive 中的分区表分为两种：静态分区和动态分区。

1.1、静态分区

可以根据 PARTITIONED BY 创建分区表。

一个表可以拥有一个或者多个分区，每个分区以文件夹的形式单独存在表文件夹的目录下。

分区列是虚拟列，它们不是数据本身的一部分，而是在加载时派生的。

1.1.1、一个分区

-- 建表

create table order_table_s

(

    order_id int,         -- 订单id

    product_name string,  -- 产品名称

    price int             -- 产品价格

)

partitioned by (deal_day string)  -- 交易日期YYYYMM

row format delimited

fields terminated by "\t";

-- 查看表结构

hive> desc order_table_s;

OK

order_id                int

product_name            string

price                   int

deal_day                string                                      

# Partition Information

# col_name              data_type               comment

deal_day                string  

-- 源数据

-- order-201901.txt

1	cellphone	2000

2	tv	3000

3	sofa	8000

4	cabinet	5000

5	bicycle	1000

6	truck	20000

-- order-201902.txt

1	apple	10

2	banana	8

3	milk	70

4	liquor	150

-- 导入数据

load data local inpath '/root/data/order-201901.txt' overwrite into table order_table_s partition(deal_day='201901');

load data local inpath '/root/data/order-201902.txt' overwrite into table order_table_s partition(deal_day='201902');

-- 查看表

hive> select * from order_table_s;

1       cellphone       2000    201901

2       tv      3000    201901

3       sofa    8000    201901

4       cabinet 5000    201901

5       bicycle 1000    201901

6       truck   20000   201901

1       apple   10      201902

2       banana  8       201902

3       milk    70      201902

4       liquor  150     201902

-- 查看201902的数据

hive> select * from order_table_s where deal_day='201902';

1       apple   10      201902

2       banana  8       201902

3       milk    70      201902

4       liquor  150     201902

-- 查看hdfs目录

[root@zgg data]# hadoop fs -ls /user/hive/warehouse/order_table_s

Found 2 items

drwxr-xr-x   - root supergroup          0 2021-01-08 16:05 /user/hive/warehouse/order_table_s/deal_day=201901

drwxr-xr-x   - root supergroup          0 2021-01-08 16:27 /user/hive/warehouse/order_table_s/deal_day=201902

[root@zgg data]# hadoop fs -ls /user/hive/warehouse/order_table_s/deal_day=201901

Found 1 items

-rw-r--r--   1 root supergroup         56 2021-01-08 18:23 /user/hive/warehouse/order_table_s/deal_day=201901/000000_0

[root@zgg data]# hadoop fs -ls /user/hive/warehouse/order_table_s/deal_day=201901

Found 1 items

-rw-r--r--   1 root supergroup         83 2021-01-08 16:05 /user/hive/warehouse/order_table_s/deal_day=201901/order-201901.txt

-- 查看mysql中的元数据

mysql> select * from PARTITION_KEY_VALS;

+---------+--------------+-------------+

| PART_ID | PART_KEY_VAL | INTEGER_IDX |

+---------+--------------+-------------+

|      16 | 201901       |           0 |

|      21 | 201902       |           0 |

+---------+--------------+-------------+

-- 删除分区201902

-- ALTER TABLE table_name DROP partition_spec, partition_spec,...

hive> alter table order_table_s drop partition(deal_day='201902');

-- 删除后，查看表

hive> select * from order_table_s;

1       cellphone       2000    201901

2       tv      3000    201901

3       sofa    8000    201901

4       cabinet 5000    201901

5       bicycle 1000    201901

6       truck   20000   201901

-- 删除后，查看hdfs目录

[root@zgg data]# hadoop fs -ls /user/hive/warehouse/order_table_s

Found 1 items

drwxr-xr-x   - root supergroup          0 2021-01-08 16:05 /user/hive/warehouse/order_table_s/deal_day=201901

-- 删除后，查看mysql中的元数据

mysql> select * from PARTITION_KEY_VALS;

+---------+--------------+-------------+

| PART_ID | PART_KEY_VAL | INTEGER_IDX |

+---------+--------------+-------------+

|      16 | 201901       |           0 |

+---------+--------------+-------------+

-- 删除后，数据和元数据都被删除了

-- ---------------------------------------------------

-- 重新添加分区201902

-- 这里的location是这个分区的数据存放的位置，默认是`/user/hive/warehouse/order_table_s`

alter table order_table_s add partition(deal_day='201902') location '/in/order';

-- 把order-201902.txt移动到'/in/order'目录下

-- 所以导入数据，也可以直接复制到对应的目录下。

[root@zgg data]# hadoop fs -mv /in/order-201902.txt /in/order

-- 查看201902的数据

hive> select * from order_table_s where deal_day='201902';

1       apple   10      201902

2       banana  8       201902

3       milk    70      201902

4       liquor  150     201902

-- hive中查询分区

hive> show partitions order_table_s;

deal_day=201901

deal_day=201902

-- 如果直接将数据复制到分区目录下，select没有数据的话，可以执行`msck repair table order_table_s` 重新同步hdfs上的分区信息。

1.1.2、多个分区

-- 建表:两个分区

create table order_table_d

(

    order_id int,         -- 订单id

    product_name string,  -- 产品名称

    price int             -- 产品价格

)

partitioned by (deal_day string,category string)  -- 交易日期YYYYMM，产品类别

row format delimited

fields terminated by "\t";

-- 查看表结构

hive> desc order_table_d;

OK

order_id                int

product_name            string

price                   int

deal_day                string

category                string                                      

# Partition Information

# col_name              data_type               comment

deal_day                string

category                string           

-- 源数据

-- order-201901-electronicproducts.txt

1	cellphone	2000

2	tv	3000

-- order-201901-car.txt

1	bicycle	1000

2	truck	20000

-- order-201902-fruit.txt

1	apple	10

2	banana	8

-- order-201902-drinks.txt

1	milk	70

2	liquor	150

-- 导入数据

load data local inpath '/root/data/order-201901-electronicproducts.txt' overwrite into table order_table_d partition(deal_day='201901',category='electronicproducts');

load data local inpath '/root/data/order-201901-car.txt' overwrite into table order_table_d partition(deal_day='201901',category='car');

load data local inpath '/root/data/order-201902-fruit.txt' overwrite into table order_table_d partition(deal_day='201902',category='fruit');

load data local inpath '/root/data/order-201902-drinks.txt' overwrite into table order_table_d partition(deal_day='201902',category='drinks');

-- 查看

hive> select * from order_table_d;

OK

1       bicycle 1000    201901  car

2       truck   20000   201901  car

1       cellphone       2000    201901  electronicproducts

2       tv      3000    201901  electronicproducts

1       milk    70      201902  drinks

2       liquor  150     201902  drinks

1       apple   10      201902  fruit

2       banana  8       201902  fruit

hive> show partitions order_table_d;

OK

deal_day=201901/category=car

deal_day=201901/category=electronicproducts

deal_day=201902/category=drinks

deal_day=201902/category=fruit

-- 查看hdfs目录

[root@zgg data]# hadoop fs -ls /user/hive/warehouse/order_table_d

Found 2 items

drwxr-xr-x   - root supergroup          0 2021-01-08 17:00 /user/hive/warehouse/order_table_d/deal_day=201901

drwxr-xr-x   - root supergroup          0 2021-01-08 17:00 /user/hive/warehouse/order_table_d/deal_day=201902

[root@zgg data]# hadoop fs -ls /user/hive/warehouse/order_table_d/deal_day=201901

Found 2 items

drwxr-xr-x   - root supergroup          0 2021-01-08 17:00 /user/hive/warehouse/order_table_d/deal_day=201901/category=car

drwxr-xr-x   - root supergroup          0 2021-01-08 17:00 /user/hive/warehouse/order_table_d/deal_day=201901/category=electronicproducts

-- 查看mysql中的元数据

mysql> select * from PARTITION_KEY_VALS;

+---------+--------------------+-------------+

| PART_ID | PART_KEY_VAL       | INTEGER_IDX |

+---------+--------------------+-------------+

|      23 | 201901             |           0 |

|      23 | electronicproducts |           1 |

|      24 | 201901             |           0 |

|      24 | car                |           1 |

|      25 | 201902             |           0 |

|      25 | fruit              |           1 |

|      26 | 201902             |           0 |

|      26 | drinks             |           1 |

+---------+--------------------+-------------+

-- 删除分区deal_day=201902/category=fruit

hive> alter table order_table_d drop partition(deal_day='201902',category='fruit');

-- 删除后，查看表分区

hive> show partitions order_table_d;

OK

deal_day=201901/category=car

deal_day=201901/category=electronicproducts

deal_day=201902/category=drinks

-- 删除后，查看hdfs目录

[root@zgg data]# hadoop fs -ls /user/hive/warehouse/order_table_d/deal_day=201902

Found 1 items

drwxr-xr-x   - root supergroup          0 2021-01-08 17:00 /user/hive/warehouse/order_table_d/deal_day=201902/category=drinks

-- 删除后，查看mysql中的元数据

mysql> select * from PARTITION_KEY_VALS;

+---------+--------------------+-------------+

| PART_ID | PART_KEY_VAL       | INTEGER_IDX |

+---------+--------------------+-------------+

|      23 | 201901             |           0 |

|      23 | electronicproducts |           1 |

|      24 | 201901             |           0 |

|      24 | car                |           1 |

|      26 | 201902             |           0 |

|      26 | drinks             |           1 |

+---------+--------------------+-------------+

-- 重新添加分区deal_day=201902/category=fruit

hive> alter table order_table_d add partition(deal_day='201902',category='fruit');

-- 重新导入

hive> load data local inpath '/root/data/order-201902-drinks.txt' overwrite into table order_table_d partition(deal_day='201902',category='drinks');

-- 查看

hive> select * from order_table_d;

1       bicycle 1000    201901  car

2       truck   20000   201901  car

1       cellphone       2000    201901  electronicproducts

2       tv      3000    201901  electronicproducts

1       milk    70      201902  drinks

2       liquor  150     201902  drinks

hive> show partitions order_table_d;

deal_day=201901/category=car

deal_day=201901/category=electronicproducts

deal_day=201902/category=drinks

deal_day=201902/category=fruit

-- 导入数据到分区表，还可以使用 insert

-- 将order_table_d的201901分区中的数据插入到order_table_s的201901分区中

-- 查看order_table_s数据

hive> select * from order_table_s;

1       apple   10      201902

2       banana  8       201902

3       milk    70      201902

4       liquor  150     201902

hive> insert into table order_table_s partition(deal_day='201901') select order_id,product_name,price from order_table_d where deal_day='201901';

-- 查看结果

hive> select * from order_table_s where deal_day='201901';

1       bicycle 1000    201901

2       truck   20000   201901

1       cellphone       2000    201901

2       tv      3000    201901

1.2、动态分区

上面展示了如何使用 insert 插入到分区表，如果再插入分区 '201902' 数据，需要再写一条 insert 语句。

而动态分区可以直接使用一条 insert 语句完成。

下面利用动态分区进行演示。

演示前先进行设置：

hive 中默认是静态分区，想要使用动态分区，需要设置如下参数，可以使用临时设置，也可以写在配置文件（hive-site.xml）里，永久生效。临时配置如下:

//开启动态分区 默认为false，不开启

set hive.exec.dynamic.partition=true;

//指定动态分区模式，默认为strict，即必须指定至少一个分区为静态分区，

//nonstrict模式表示允许所有的分区字段都可以使用动态分区

set hive.exec.dynamic.partition.mode=nonstrict;

-- 创建动态分区表

create table order_table_dy

(

    order_id int,         -- 订单id

    product_name string,  -- 产品名称

    price int             -- 产品价格

)

partitioned by (deal_day string)  -- 交易日期YYYYMM，产品类别

row format delimited

fields terminated by "\t";

hive> select * from order_table_s;

1       bicycle 1000    201901

2       truck   20000   201901

1       cellphone       2000    201901

2       tv      3000    201901

1       apple   10      201902

2       banana  8       201902

3       milk    70      201902

4       liquor  150     201902

-- 将order_table_s表里的两个分区的数据插入到order_table_dy表

hive> insert into table order_table_dy(deal_day) select order_id,product_name,price,deal_day from order_table_s;

-- 查看结果

hive> select * from order_table_dy where deal_day=201901;

OK

1       bicycle 1000    201901

2       truck   20000   201901

1       cellphone       2000    201901

2       tv      3000    201901

hive> select * from order_table_dy where deal_day=201902;

OK

1       apple   10      201902

2       banana  8       201902

3       milk    70      201902

4       liquor  150     201902

hive> show partitions order_table_dy;

OK

deal_day=201901

deal_day=201902

静态分区(SP)列:在涉及多个分区列的 DML/DDL 中，这些列的值在编译时已知(由用户给出)。

动态分区(DP)列:在执行时才知道其值的列。

DP 列的指定方式与 SP 列的指定方式相同：在 partition 子句中。唯一的区别是 DP 列没有值，而 SP 列有。在 partition 子句中，我们需要指定所有分区列，即使它们都是 DP 列。

在 INSERT ... SELECT ... 查询时，动态分区列必须在 SELECT 语句中的最后一个列中指定，且顺序与它们在 PARTITION() 子句中出现的顺序相同。

-- 正确

INSERT OVERWRITE TABLE T PARTITION (ds='2010-03-03', hr)

SELECT key, value, /*ds,*/ hr FROM srcpart WHERE ds is not null and hr>10;

-- 错误

INSERT OVERWRITE TABLE T PARTITION (ds, hr = 11)

SELECT key, value, ds/*, hr*/ FROM srcpart WHERE ds is not null and hr=11;

2、分桶

通过计算表的某些列的哈希值，分区中的数据再被划分到桶中。

例如，page_views 表根据 userid 分桶，userid 是 page_view 表的列之一，而不是分区列。

将表或分区组织成桶有以下几个目的：

(1)抽样更高效，因为在处理大规模的数据集时，在开发、测试阶段将所有的数据全部处理一遍可能不太现实，这时抽样就必不可少。

抽样：

    SELECT * FROM table_name TABLESAMPLE(n PERCENT)；

    抽样出n%的数据，会全表扫描。

有了桶之后呢？

    如果在 TABLESAMPLE 子句中指定的列与 CLUSTERED BY 子句中的列相匹配，则 TABLESAMPLE 只扫描表中要求的哈希分区【就是具体的桶】

    SELECT * FROM film TABLESAMPLE(BUCKET x OUTOF y)

        如果表是用 `CLUSTERED BY id INTO 32 BUCKETS` 创建的。

        TABLESAMPLE(BUCKET 3 OUT OF 16 ON id)：

        将挑选出第 3 和第 19 个聚类，因为每个桶将由 (32/16)=2 个聚类组成。

        【每个桶有2个聚类，一共64个聚类，取出第 3 和第 19 个聚类】

        TABLESAMPLE(BUCKET 3 OUT OF 64 ON id)：

        将挑选出第 3 个聚类的一半，因为每个桶将由 (32/64)=1/2 个聚类组成

(2)更好的查询处理效率。

大表在JOIN的时候，效率低下。如果对两个表先分别按id分桶，那么相同id都会归入一个桶。

那么此时再进行JOIN的时候是按照桶来JOIN的，那么大大减少了JOIN的数量。

在建立桶之前，需要设置 set hive.enforce.bucketing=true;，使得 hive 能识别桶。【仅版本 0.x 和 1.x，对于2.x版本不再需要】

hive> select * from order_table_s;

OK

1       bicycle 1000    201901

2       truck   20000   201901

1       cellphone       2000    201901

2       tv      3000    201901

1       apple   10      201902

2       banana  8       201902

3       milk    70      201902

4       liquor  150     201902

-- 创建分桶

create table order_table_buckets

(

    order_id int,         -- 订单id

    product_name string,  -- 产品名称

    price int             -- 产品价格

)

partitioned by (deal_day string)  -- 交易日期YYYYMM，产品类别

clustered by (order_id) into 4 buckets

row format delimited

fields terminated by "\t";

-- 导入数据：

hive> insert into table order_table_buckets partition(deal_day) select order_id,product_name,price,deal_day from order_table_s where order_id=1;

-- 查看表

hive> select * from order_table_buckets;

OK

1       cellphone       2000    201901

1       bicycle 1000    201901

1       apple   10      201902

-- 查看hdfs中的目录

[root@zgg ~]# hadoop fs -ls /user/hive/warehouse/order_table_buckets

Found 2 items

drwxr-xr-x   - root supergroup          0 2021-01-08 18:49 /user/hive/warehouse/order_table_buckets/deal_day=201901

drwxr-xr-x   - root supergroup          0 2021-01-08 18:49 /user/hive/warehouse/order_table_buckets/deal_day=201902

[root@zgg ~]# hadoop fs -ls /user/hive/warehouse/order_table_buckets/deal_day=201901

Found 4 items

-rw-r--r--   1 root supergroup          0 2021-01-08 18:49 /user/hive/warehouse/order_table_buckets/deal_day=201901/000000_0

-rw-r--r--   1 root supergroup         32 2021-01-08 18:49 /user/hive/warehouse/order_table_buckets/deal_day=201901/000001_0

-rw-r--r--   1 root supergroup          0 2021-01-08 18:49 /user/hive/warehouse/order_table_buckets/deal_day=201901/000002_0

-rw-r--r--   1 root supergroup          0 2021-01-08 18:49 /user/hive/warehouse/order_table_buckets/deal_day=201901/000003_0

参考：

https://blog.51cto.com/10814168/2135046