-- Managed (non-EXTERNAL) table with three string columns, stored as
-- comma-delimited plain text.
CREATE TABLE hive_1 (
    id     STRING,
    name   STRING,
    gender STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- LOCAL INPATH takes the file from the client's local filesystem.
-- INTO TABLE (no OVERWRITE) adds to whatever the table already holds.
LOAD DATA LOCAL INPATH '/luozt/hive_001.txt' INTO TABLE hive_1;

-- External table with the same three string columns, stored as
-- comma-delimited plain text. No LOCATION is given, so the default
-- table location is used.
CREATE EXTERNAL TABLE hive_2 (
    id     STRING,
    name   STRING,
    gender STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Without LOCAL, the path refers to the cluster filesystem rather than
-- the client's local disk.
LOAD DATA INPATH '/luo/hive_001.txt' INTO TABLE hive_2;
-- Count the rows of hive_1.
-- (The notes originally used `count 'hive_1'`, which is HBase shell
-- syntax and is not valid HiveQL.)
SELECT COUNT(*) FROM hive_1;

-- Empty the table: removes all rows but keeps the table definition.
TRUNCATE TABLE log_struct;

-- Drop the table.
DROP TABLE log_struct;

-- Table partitioned on two string keys (dt, dept), stored as
-- comma-delimited plain text.
CREATE TABLE partition_table (
    name   STRING,
    salary FLOAT,
    gender STRING,
    level  STRING
)
PARTITIONED BY (dt STRING, dept STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Describe the table. The terminating semicolon was missing, which made
-- this statement run into the next one.
DESC partition_table;

-- Show the partition information of the partitioned table.
SHOW PARTITIONS partition_table;

-- Load data into a specific partition of the partitioned table.
LOAD DATA LOCAL INPATH '/luozt/par.txt' INTO TABLE partition_table PARTITION (dt='2014-04-01', dept='yonyu');

-- Add a partition, pointing it at an explicit directory.
ALTER TABLE partition_table ADD PARTITION (dt='2014-04-03', dept='yonyou3')
    LOCATION '/user/hive/warehouse/luo.db/partition_table/dt=2014-04-03/dept=yonyou3';

-- Drop a partition.
-- NOTE(review): this targets dept='yonyou4' while the partition added just
-- above is dept='yonyou3' — confirm which one was intended.
ALTER TABLE partition_table DROP PARTITION (dt='2014-04-03', dept='yonyou4');

-- Simple row filter.
SELECT * FROM partition_table WHERE salary > 7600;

-- Nested query, written in Hive's FROM-first form: the subquery is
-- aliased as e and the outer SELECT/WHERE follow it.
FROM (SELECT name, salary FROM partition_table) e
SELECT e.name, e.salary
WHERE e.salary > 7600;

-- Usage of IN.
SELECT * FROM partition_table WHERE salary IN (7000, 6700);

-- Usage of CASE: label each row with a salary level.
-- NOTE(review): the ranges leave gaps — salary = 6800 and salaries from
-- 8000 through 8100 match no WHEN branch and fall through to 'L0'.
-- Confirm whether that is intended.
SELECT
    name,
    salary,
    CASE
        WHEN salary < 6800 THEN 'L1'
        WHEN salary > 6800 AND salary < 8000 THEN 'L2'
        WHEN salary > 8100 THEN 'L3'
        ELSE 'L0'
    END AS salary_level
FROM partition_table;

-- Total salary per gender.
-- NOTE(review): the original heading for this query read "usage of HAVING",
-- but the query has no HAVING clause. To filter on the aggregate, append
-- HAVING SUM(salary) > <threshold> after the GROUP BY.
SELECT gender, SUM(salary)
FROM partition_table
GROUP BY gender;

-- Join practice: two comma-delimited text tables that share a `user` column.
-- `user` is a reserved keyword in Hive 1.2+ (with SQL:2011 reserved keywords
-- enabled, the default), so it is back-quoted wherever it is used as a
-- column name. The column name itself is unchanged.
CREATE TABLE group1 (
    `user` STRING,
    score  INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

CREATE TABLE group_join (
    `user` STRING,
    class  STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Ordinary join.
SELECT b.class, a.score
FROM group1 a
JOIN group_join b ON (a.`user` = b.`user`);

-- When one table is very small, use the MAPJOIN(b) hint, where b is the
-- small table.
SELECT /*+MAPJOIN(b)*/ b.class, a.score
FROM group1 a
JOIN group_join b ON (a.`user` = b.`user`);

-- LEFT SEMI JOIN (no example was recorded in these notes).

-- Grouping: every non-aggregated column in the SELECT list must also be a
-- GROUP BY column. `user` is back-quoted because it is a reserved keyword
-- in Hive 1.2+.
SELECT `user`, SUM(score)
FROM group1
GROUP BY `user`;

-- Optimization: enable map-side aggregation. The terminating semicolon
-- was missing.
SET hive.map.aggr=true;

-- ORDER BY practice table. `user` is back-quoted because it is a reserved
-- keyword in Hive 1.2+.
CREATE TABLE orderby_test (
    `user`  STRING,
    class   STRING,
    math    INT,
    english INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Ascending by default; append DESC for descending order.
-- After SET hive.mapred.mode=strict; an ORDER BY must also carry a LIMIT.
SELECT * FROM orderby_test ORDER BY math;

-- SORT BY is not affected by SET hive.mapred.mode=strict, and the number of
-- reducers can be chosen with SET mapred.reduce.tasks=<number>.
-- SORT BY only sorts within each reducer, so each reducer's output is
-- ordered; per the author's notes this improves the efficiency of a global
-- sort.

-- UNION ALL: per these notes, Hive does not support a top-level UNION, so
-- the UNION must be wrapped in a subquery and the subquery given an alias.
-- (NOTE(review): this restriction applies to older Hive releases — confirm
-- against the target version.)
SELECT *
FROM (
    SELECT COUNT(*) FROM group1
    UNION ALL
    SELECT COUNT(*) FROM orderby_test
) temp;

-- Pattern worth remembering: tag each branch with a literal marker column so
-- the source of every row stays identifiable after the union.
-- Fix: the second branch read `select name height, ...` (missing comma),
-- which aliased name AS height and produced two columns instead of three,
-- so the branches no longer matched.
SELECT name, height, mark
FROM (
    SELECT name, height, '0' AS mark FROM a
    UNION ALL
    SELECT name, height, '1' AS mark FROM b
) t;

-- Index practice: target table, partitioned by dt.
CREATE TABLE index_test (
    id   INT,
    name STRING
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- First create a temporary (staging) table that carries dt as a regular
-- column.
CREATE TABLE temp (
    id   INT,
    name STRING,
    dt   STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Dynamic partitioning: the partition value is taken from the last column
-- of the SELECT instead of being spelled out in the PARTITION clause.
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.dynamic.partition=true;

INSERT OVERWRITE TABLE index_test PARTITION (dt)
SELECT id, name, dt FROM temp;

-- Create the index. Per the author's notes, the table must already have
-- partitions at this point, otherwise index creation does not work.
CREATE INDEX index1 ON TABLE index_test (id)
    AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
    WITH DEFERRED REBUILD;

-- WITH DEFERRED REBUILD leaves the index empty until it is rebuilt.
ALTER INDEX index1 ON index_test REBUILD;

SHOW INDEX ON index_test;
SHOW PARTITIONS index_test;

-- Buckets: used here for sampling.

-- Plain staging table.
CREATE TABLE tb_tmp (
    id       INT,
    age      INT,
    name     STRING,
    timeflag BIGINT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

-- Bucketed table: rows are clustered by id into 5 buckets and sorted by age
-- within each bucket. The terminating semicolon was missing, which made this
-- statement run into the INSERT below.
-- NOTE(review): older Hive releases may need bucketing enforcement switched
-- on before the INSERT for the data to actually be bucketed — confirm for
-- the target version.
CREATE TABLE tb_stu (
    id       INT,
    age      INT,
    name     STRING,
    timeflag BIGINT
)
CLUSTERED BY (id) SORTED BY (age) INTO 5 BUCKETS
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

INSERT INTO TABLE tb_stu
SELECT id, age, name, timeflag FROM tb_tmp;

-- Query using the buckets: sample bucket 1 of 5, bucketing on id.
SELECT * FROM tb_stu TABLESAMPLE (BUCKET 1 OUT OF 5 ON id);

-- Storage formats and complex data types.
-- Per the author's notes: data cannot be LOADed directly into an RCFile
-- table; insert it from a temporary table instead.

-- ARRAY: uid is an array of BIGINT whose elements are separated by '|'.
CREATE TABLE log_array (
    ip  STRING,
    uid ARRAY<BIGINT>
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '/luo/log_array.txt' INTO TABLE log_array PARTITION (dt=20150902);

-- Query values inside the array.
SELECT uid[1] FROM log_array;
SELECT ip, SIZE(uid) FROM log_array WHERE dt=20150902;
-- The terminating semicolon was missing on this statement.
SELECT ip FROM log_array WHERE dt=20150902 AND ARRAY_CONTAINS(uid, 4732974);

-- MAP: fields are separated by '#', map entries by '&', and each key is
-- separated from its value by '='.
CREATE TABLE log_map (
    ts       STRING,
    ip       STRING,
    type     STRING,
    logtype  STRING,
    request  MAP<STRING, STRING>,
    response MAP<STRING, STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '#'
    COLLECTION ITEMS TERMINATED BY '&'
    MAP KEYS TERMINATED BY '='
STORED AS TEXTFILE;

-- Query: look up a single key of the map column.
SELECT request['src'] FROM log_map;

-- STRUCT: the struct's members are separated by '#'.
-- `user` is back-quoted because it is a reserved keyword in Hive 1.2+; the
-- column name itself is unchanged.
CREATE TABLE log_struct (
    ip     STRING,
    `user` STRUCT<name:STRING, age:INT>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '#'
STORED AS TEXTFILE;

-- Sample data (contents of the input file, not HiveQL):
--   192.168.1.1,wow#23
--   192.168.1.1,wow#23
--   192.168.1.1,wow#23
--   192.168.1.1,wow#23
--   192.168.1.1,wow#23
--   192.168.1.1,wow#23

-- Access a struct member with dot notation.
SELECT `user`.name FROM log_struct;

hadoop-hive学习笔记的更多相关文章

  1. hive学习笔记之一:基本数据类型

    欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...

  2. hive学习笔记之三:内部表和外部表

    欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...

  3. hive学习笔记之四:分区表

    欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...

  4. hive学习笔记之五:分桶

    欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...

  5. hive学习笔记之六:HiveQL基础

    欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...

  6. hive学习笔记之七:内置函数

    欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...

  7. hive学习笔记之九:基础UDF

    欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...

  8. hive学习笔记之十:用户自定义聚合函数(UDAF)

    欢迎访问我的GitHub 这里分类和汇总了欣宸的全部原创(含配套源码):https://github.com/zq2599/blog_demos 本篇概览 本文是<hive学习笔记>的第十 ...

  9. hive学习笔记之十一:UDTF

    欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...

  10. Hadoop入门学习笔记---part4

    紧接着<Hadoop入门学习笔记---part3>中的继续了解如何用java在程序中操作HDFS. 众所周知,对文件的操作无非是创建,查看,下载,删除.下面我们就开始应用java程序进行操 ...

随机推荐

  1. loadrunner脚本篇——Run-time Settings之ContentCheck

    运用场景(很少用到): ContentCheck的设置可用来让VuGen检测存在错误的站点页面.如果被测的Web应用没有使用自定义的错误页面,那么这里不用添加规则,因为LR在回放时候,可以默认的捕捉到 ...

  2. loadrunner之脚本篇——Run-time Settings之Pacing

      As soon as the previous iteration ends 前一个迭代一结束就尽可能快的开始新一轮的迭代   After the previous iteration ends ...

  3. C++中引用编译过的C代码为什么要用“extern c”

    函数经过编译系统的翻译成汇编,函数名对应着汇编标号.  因为C编译函数名与得到的汇编代号基本一样,如:fun()=>_fun, main=>_main  但是C++中函数名与得到的汇编代号 ...

  4. 每天一个Linux命令(50)netstat命令

        netstat命令用来打印Linux中网络系统的状态信息,可让你得知整个Linux系统的网络情况.     (1)用法:     用法:  netstat [选项参数]     (2)功能: ...

  5. 【leetcode刷题笔记】Sudoku Solver

    Write a program to solve a Sudoku puzzle by filling the empty cells. Empty cells are indicated by th ...

  6. Linux的XServer

    Moblin Core是在Gnome Mobile的平台上建立.我以前玩Linux,提交的都和图像没有关系,连Xwindows都不用启动,开机后直接进入文本命令行,所以这方面了解得很少,需要学习一下, ...

  7. Kubernetes Ingress

    Kubernetes关于服务的暴露主要是通过NodePort方式,通过绑定node主机的某个端口,然后进行pod的请求转发和负载均衡,但这种方式下缺陷是 Service可能有很多个,如果每个都绑定一个 ...

  8. Jquery 获取地址位置

    直接在浏览器地址 输入: http://pv.sohu.com/cityjson?ie=utf-8 可以查看数据格式 引入一个搜狐的js库: <script src="http://p ...

  9. linux 基本命令___0001

    参考公众号:生信媛 参考链接:每天一个linux命令(61):wget命令 参考链接:<sort帮你排序>-linux命令五分钟系列之二十六 参考链接:每天一个linux命令(34):du ...

  10. uvalive 6932

    三个串必须要一起dp 之前刚学了dfs的记忆化搜索的dp方式 觉得很舒服 现学现卖然后两个小时都没有做出来 优化1:之前在dfs中 对每一个pos都会枚举所有可能的组合 结合当前状态来产生新的状态 来 ...