Hive之Order,Sort,Cluster and Distribute By
- 测试数据
create table sort_test(
id int,
name string
)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;
[root@wadeyu ~]# cat sort_test.log
4679 aaa
4728 aaa
3040 aaa
4207 aaa
2231 aaa
1279 aaa
7954 aaa
582 aaa
7096 aaa
4878 aaa
9684 aaa
1540 aaa
4826 aaa
2543 aaa
2323 aaa
1420 aaa
5083 aaa
8965 aaa
1391 aaa
9719 aaa
9901 aaa
2393 aaa
6024 aaa
444 aaa
1574 aaa
8881 aaa
5739 aaa
8689 aaa
1614 aaa
9340 aaa
6726 aaa
109 aaa
6941 aaa
9562 aaa
9019 aaa
4945 aaa
2206 aaa
5910 aaa
8552 aaa
1795 aaa
2720 aaa
9007 aaa
8377 aaa
2179 aaa
3683 aaa
5869 aaa
5448 aaa
5223 aaa
5127 aaa
4616 aaa
2340 aaa
1268 aaa
4332 aaa
2989 aaa
19 aaa
7880 aaa
505 aaa
5975 aaa
5288 aaa
5682 aaa
376 aaa
7502 aaa
6448 aaa
3774 aaa
5541 aaa
9636 aaa
2037 aaa
246 aaa
6151 aaa
7837 aaa
1506 aaa
3749 aaa
9335 aaa
3973 aaa
5160 aaa
7929 aaa
834 aaa
3451 aaa
1766 aaa
6228 aaa
8961 aaa
8177 aaa
2340 aaa
4245 aaa
3226 aaa
2670 aaa
784 aaa
7699 aaa
2054 aaa
6006 aaa
4204 aaa
8905 aaa
6182 aaa
1271 aaa
5415 aaa
5164 aaa
4320 aaa
3736 aaa
2287 aaa
6559 aaa
- Order By
- Job中只会启动一个reduce做全局排序,数据量大时,耗时会很久
- 在strict模式(hive.mapred.mode=strict)下,必须添加limit语句限制返回条数
# 语法格式
colOrder: ( ASC | DESC )
colNullOrder: (NULLS FIRST | NULLS LAST) -- (Note: Available in Hive 2.1.0 and later)
orderBy: ORDER BY colName colOrder? colNullOrder? (',' colName colOrder? colNullOrder?)*
query: SELECT expression (',' expression)* FROM src orderBy
# 排序
select * from sort_test order by id desc;
+---------------+-----------------+--+
| sort_test.id | sort_test.name |
+---------------+-----------------+--+
| 9901 | aaa |
| 9719 | aaa |
| 9684 | aaa |
| 9636 | aaa |
| 9562 | aaa |
| 9340 | aaa |
| 9335 | aaa |
| 9019 | aaa |
| 9007 | aaa |
| 8965 | aaa |
| 8961 | aaa |
| 8905 | aaa |
| 8881 | aaa |
| 8689 | aaa |
| 8552 | aaa |
| 8377 | aaa |
| 8177 | aaa |
| 7954 | aaa |
| 7929 | aaa |
| 7880 | aaa |
| 7837 | aaa |
| 7699 | aaa |
| 7502 | aaa |
| 7096 | aaa |
| 6941 | aaa |
| 6726 | aaa |
| 6559 | aaa |
| 6448 | aaa |
| 6228 | aaa |
| 6182 | aaa |
| 6151 | aaa |
| 6024 | aaa |
| 6006 | aaa |
| 5975 | aaa |
| 5910 | aaa |
| 5869 | aaa |
| 5739 | aaa |
| 5682 | aaa |
| 5541 | aaa |
| 5448 | aaa |
| 5415 | aaa |
| 5288 | aaa |
| 5223 | aaa |
| 5164 | aaa |
| 5160 | aaa |
| 5127 | aaa |
| 5083 | aaa |
| 4945 | aaa |
| 4878 | aaa |
| 4826 | aaa |
| 4728 | aaa |
| 4679 | aaa |
| 4616 | aaa |
| 4332 | aaa |
| 4320 | aaa |
| 4245 | aaa |
| 4207 | aaa |
| 4204 | aaa |
| 3973 | aaa |
| 3774 | aaa |
| 3749 | aaa |
| 3736 | aaa |
| 3683 | aaa |
| 3451 | aaa |
| 3226 | aaa |
| 3040 | aaa |
| 2989 | aaa |
| 2720 | aaa |
| 2670 | aaa |
| 2543 | aaa |
| 2393 | aaa |
| 2340 | aaa |
| 2340 | aaa |
| 2323 | aaa |
| 2287 | aaa |
| 2231 | aaa |
| 2206 | aaa |
| 2179 | aaa |
| 2054 | aaa |
| 2037 | aaa |
| 1795 | aaa |
| 1766 | aaa |
| 1614 | aaa |
| 1574 | aaa |
| 1540 | aaa |
| 1506 | aaa |
| 1420 | aaa |
| 1391 | aaa |
| 1279 | aaa |
| 1271 | aaa |
| 1268 | aaa |
| 834 | aaa |
| 784 | aaa |
| 582 | aaa |
| 505 | aaa |
| 444 | aaa |
| 376 | aaa |
| 246 | aaa |
| 109 | aaa |
| 19 | aaa |
+---------------+-----------------+--+
- Sort By
- 排序前会根据排序字段分区,一个job启动多个reduce进行局部排序
- 如果有limit语句,会再次启动一个job,取出每个局部排好序的前n条,再进行全局排序
- 只保证局部有序,不保证全局有序
# Sort By语法
colOrder: ( ASC | DESC )
sortBy: SORT BY colName colOrder? (',' colName colOrder?)*
query: SELECT expression (',' expression)* FROM src sortBy
# 设置开启的reduce个数
set mapreduce.job.reduces=2;
0: jdbc:hive2://> set mapreduce.job.reduces;
+--------------------------+--+
| set |
+--------------------------+--+
| mapreduce.job.reduces=2 |
+--------------------------+--+
# 执行局部排序(未带limit)
0: jdbc:hive2://> select * from sort_test sort by id desc;
+---------------+-----------------+--+
| sort_test.id | sort_test.name |
+---------------+-----------------+--+
| 9901 | aaa |
| 9684 | aaa |
| 9340 | aaa |
| 9019 | aaa |
| 9007 | aaa |
| 8965 | aaa |
| 8961 | aaa |
| 8689 | aaa |
| 8552 | aaa |
| 8177 | aaa |
| 7837 | aaa |
| 7699 | aaa |
| 7502 | aaa |
| 6559 | aaa |
| 6448 | aaa |
| 6228 | aaa |
| 6024 | aaa |
| 6006 | aaa |
| 5975 | aaa |
| 5910 | aaa |
| 5869 | aaa |
| 5739 | aaa |
| 5682 | aaa |
| 5541 | aaa |
| 5448 | aaa |
| 5415 | aaa |
| 5288 | aaa |
| 5164 | aaa |
| 5160 | aaa |
| 5083 | aaa |
| 4878 | aaa |
| 4826 | aaa |
| 4679 | aaa |
| 4616 | aaa |
| 4245 | aaa |
| 4207 | aaa |
| 3736 | aaa |
| 3451 | aaa |
| 3226 | aaa |
| 3040 | aaa |
| 2989 | aaa |
| 2720 | aaa |
| 2670 | aaa |
| 2340 | aaa |
| 2231 | aaa |
| 2206 | aaa |
| 2054 | aaa |
| 2037 | aaa |
| 1766 | aaa |
| 1614 | aaa |
| 1540 | aaa |
| 1506 | aaa |
| 1420 | aaa |
| 1268 | aaa |
| 834 | aaa |
| 784 | aaa |
| 582 | aaa |
| 444 | aaa |
| 376 | aaa |
| 246 | aaa |
| 19 | aaa |
| 9719 | aaa |
| 9636 | aaa |
| 9562 | aaa |
| 9335 | aaa |
| 8905 | aaa |
| 8881 | aaa |
| 8377 | aaa |
| 7954 | aaa |
| 7929 | aaa |
| 7880 | aaa |
| 7096 | aaa |
| 6941 | aaa |
| 6726 | aaa |
| 6182 | aaa |
| 6151 | aaa |
| 5223 | aaa |
| 5127 | aaa |
| 4945 | aaa |
| 4728 | aaa |
| 4332 | aaa |
| 4320 | aaa |
| 4204 | aaa |
| 3973 | aaa |
| 3774 | aaa |
| 3749 | aaa |
| 3683 | aaa |
| 2543 | aaa |
| 2393 | aaa |
| 2340 | aaa |
| 2323 | aaa |
| 2287 | aaa |
| 2179 | aaa |
| 1795 | aaa |
| 1574 | aaa |
| 1391 | aaa |
| 1279 | aaa |
| 1271 | aaa |
| 505 | aaa |
| 109 | aaa |
+---------------+-----------------+--+
# 带limit排序(会额外再启动一个job进行全局排序)
0: jdbc:hive2://> select * from sort_test sort by id desc limit 300;
+---------------+-----------------+--+
| sort_test.id | sort_test.name |
+---------------+-----------------+--+
| 9901 | aaa |
| 9719 | aaa |
| 9684 | aaa |
| 9636 | aaa |
| 9562 | aaa |
| 9340 | aaa |
| 9335 | aaa |
| 9019 | aaa |
| 9007 | aaa |
| 8965 | aaa |
| 8961 | aaa |
| 8905 | aaa |
| 8881 | aaa |
| 8689 | aaa |
| 8552 | aaa |
| 8377 | aaa |
| 8177 | aaa |
| 7954 | aaa |
| 7929 | aaa |
| 7880 | aaa |
| 7837 | aaa |
| 7699 | aaa |
| 7502 | aaa |
| 7096 | aaa |
| 6941 | aaa |
| 6726 | aaa |
| 6559 | aaa |
| 6448 | aaa |
| 6228 | aaa |
| 6182 | aaa |
| 6151 | aaa |
| 6024 | aaa |
| 6006 | aaa |
| 5975 | aaa |
| 5910 | aaa |
| 5869 | aaa |
| 5739 | aaa |
| 5682 | aaa |
| 5541 | aaa |
| 5448 | aaa |
| 5415 | aaa |
| 5288 | aaa |
| 5223 | aaa |
| 5164 | aaa |
| 5160 | aaa |
| 5127 | aaa |
| 5083 | aaa |
| 4945 | aaa |
| 4878 | aaa |
| 4826 | aaa |
| 4728 | aaa |
| 4679 | aaa |
| 4616 | aaa |
| 4332 | aaa |
| 4320 | aaa |
| 4245 | aaa |
| 4207 | aaa |
| 4204 | aaa |
| 3973 | aaa |
| 3774 | aaa |
| 3749 | aaa |
| 3736 | aaa |
| 3683 | aaa |
| 3451 | aaa |
| 3226 | aaa |
| 3040 | aaa |
| 2989 | aaa |
| 2720 | aaa |
| 2670 | aaa |
| 2543 | aaa |
| 2393 | aaa |
| 2340 | aaa |
| 2340 | aaa |
| 2323 | aaa |
| 2287 | aaa |
| 2231 | aaa |
| 2206 | aaa |
| 2179 | aaa |
| 2054 | aaa |
| 2037 | aaa |
| 1795 | aaa |
| 1766 | aaa |
| 1614 | aaa |
| 1574 | aaa |
| 1540 | aaa |
| 1506 | aaa |
| 1420 | aaa |
| 1391 | aaa |
| 1279 | aaa |
| 1271 | aaa |
| 1268 | aaa |
| 834 | aaa |
| 784 | aaa |
| 582 | aaa |
| 505 | aaa |
| 444 | aaa |
| 376 | aaa |
| 246 | aaa |
| 109 | aaa |
| 19 | aaa |
+---------------+-----------------+--+
- Order By 和 Sort By区别
- Order By全局排序,Sort By局部排序
- 取TopN时,Sort By 比 Order By效率更高
- Distribute By
- 查询语句对指定字段分组
- 通常结合Sort By语句使用,比如同一个地区,不同商家排序,就需要用到这个
- Cluster By
- 分组且排序,等价于 Distribute By 和 Sort By 的结合
-- 使用示例
SELECT col1, col2 FROM t1 CLUSTER BY col1
SELECT col1, col2 FROM t1 DISTRIBUTE BY col1
SELECT col1, col2 FROM t1 DISTRIBUTE BY col1 SORT BY col1 ASC, col2 DESC
参考资料
【0】Hive wiki - LanguageManual SortBy
Hive之Order,Sort,Cluster and Distribute By的更多相关文章
- hive 中的Sort By、 Order By、Cluster By、Distribute By 区别
Order by: order by 会对输入做全局排序,因此只有一个reducer(多个reducer无法保证全局有序)只有一个reducer,会导致当输入规模较大时,需要较长的计算时间.在hive ...
- [转]hive中order by,distribute by,sort by,cluster by
转至http://my.oschina.net/repine/blog/296562 order by,distribute by,sort by,cluster by 查询使用说明 1 2 3 4 ...
- hive中order by、distribute by、sort by和cluster by的区别和联系
hive中order by.distribute by.sort by和cluster by的区别和联系 order by order by 会对数据进行全局排序,和oracle和mysql等数据库中 ...
- hive中order by ,sort by ,distribute by, cluster by 的区别(**很详细**)
hive 查询语法 select [all | distinct] select_ condition, select_ condition from table_name a [join table ...
- HiveQL之Sort by、Distribute by、Cluster by、Order By详解
在这里解释一下select语法中的order by.sort by.distribute by.cluster by.order by语法. 一.order by语法 在hiveQL中Order by ...
- hive中order by,sort by, distribute by, cluster by作用以及用法
1. order by Hive中的order by跟传统的sql语言中的order by作用是一样的,会对查询的结果做一次全局排序,所以说,只有hive的sql中制定了order by所有的 ...
- Hive中的order by、sort by、distribute by、cluster by解释及测试
结论: order by:全局排序,这也是4种排序手段中唯一一个能在终端输出中看出全局排序的方法,只有一个reduce,可能造成renduce任务时间过长,在严格模式下,要求必须具备limit子句. ...
- hive 排序 order by sort by distribute by cluster by
order by: order by是全局排序,受hive.mapred.mode的影响. 使用orderby有一些限制: 1.在严格模式下(hive.mapred.mod ...
- Hive中order by,sort by,distribute by,cluster by的区别
一:order by order by会对输入做全局排序,因此只有一个Reducer(多个Reducer无法保证全局有序),然而只有一个Reducer,会导致当输入规模较大时,消耗较长的计算时间.关于 ...
随机推荐
- Websocket 关闭浏览器报错
这个报错,是因为你关闭之后,websocket 自动连接失败造成的 只要在你的websocket 运行的类里面加上: @OnError public void onError(Throwable e, ...
- [Windows Server 2008] 404错误设置方法
★ 欢迎来到[护卫神·V课堂],网站地址:http://v.huweishen.com ★ 护卫神·V课堂 是护卫神旗下专业提供服务器教学视频的网站,每周更新视频. ★ 本节我们将带领大家:如何设置4 ...
- 番茄花园Ghost Win10系统X64位10041装机版2015年4月
转载:系统妈,系统下载地址:http://www.xitongma.com/windows10/2015-04-01/6639.html 番茄花园Ghost Win10系统X64位10041装机版20 ...
- matlab中数据类型
在MATLAB中有15种基本数据类型,分别是8种整型数据.单精度浮点型.双精度浮点型.逻辑型.字符串型.单元数组.结构体类型和函数句柄.这15种基本数据类型具体如下. 有符号整数型:int8,int1 ...
- 习水医院12C RAC 数据库安装文档
环境介绍 OS: Oracle Enterprise Linux 6.4 (For RAC Nodes) DB: GI and Database 12.1.0.2 所需介质 p17694377 ...
- vue工程化之去除Eslint验证
有的时候用vue-cli创建好项目之后,写代码时会出现换行和空格报错,出现这么写错误是什么原因呢? 相信第一次接触时有点摸不着头脑.其实是在你用vue-cli脚手架构建项目时用了ESLint代码检查工 ...
- java_线程的同步机制
1.同步代码块: 使用synchronized包住可能会出现安全问题的代码 import static java.lang.Thread.sleep; class Test01{ public sta ...
- nexus 搭建 maven 私服
nexus 搭建 maven 私服 本机环境 Win 8 JDK 7 Maven 3.2 Nexus 2.11 版本选择 http://www.sonatype.org/nexus/archivedn ...
- Bzoj4899 记忆的轮廓
B. 记忆的轮廓 题目描述 通往贤者之塔的路上,有许多的危机.我们可以把这个地形看做是一颗树,根节点编号为1,目标节点编号为n,其中1-n的简单路径上,编号依次递增,在[1,n]中,一共有n个节点.我 ...
- pom.xml配置引用项目时不生效
1 在项目pom.xml配置中引用项目A,但是编译时,取提数引起是B: 2 原因是:[Java Build Path - Projects] 引用的还是老的项目B,删除该引用即可解决.