*:first-child {

margin-top: 0 !important;

}

body>*:last-child {

margin-bottom: 0 !important;

}

/* BLOCKS

=============================================================================*/

p, blockquote, ul, ol, dl, table, pre {

margin: 15px 0;

}

/* HEADERS

=============================================================================*/

h1, h2, h3, h4, h5, h6 {

margin: 20px 0 10px;

padding: 0;

font-weight: bold;

-webkit-font-smoothing: antialiased;

}

h1 tt, h1 code, h2 tt, h2 code, h3 tt, h3 code, h4 tt, h4 code, h5 tt, h5 code, h6 tt, h6 code {

font-size: inherit;

}

h1 {

font-size: 28px;

color: #000;

}

h2 {

font-size: 24px;

border-bottom: 1px solid #ccc;

color: #000;

}

h3 {

font-size: 18px;

}

h4 {

font-size: 16px;

}

h5 {

font-size: 14px;

}

h6 {

color: #777;

font-size: 14px;

}

body>h2:first-child, body>h1:first-child, body>h1:first-child+h2, body>h3:first-child, body>h4:first-child, body>h5:first-child, body>h6:first-child {

margin-top: 0;

padding-top: 0;

}

a:first-child h1, a:first-child h2, a:first-child h3, a:first-child h4, a:first-child h5, a:first-child h6 {

margin-top: 0;

padding-top: 0;

}

h1+p, h2+p, h3+p, h4+p, h5+p, h6+p {

margin-top: 10px;

}

/* LINKS

=============================================================================*/

a {

color: #4183C4;

text-decoration: none;

}

a:hover {

text-decoration: underline;

}

/* LISTS

=============================================================================*/

ul, ol {

padding-left: 30px;

}

ul li > :first-child,

ol li > :first-child,

ul li ul:first-of-type,

ol li ol:first-of-type,

ul li ol:first-of-type,

ol li ul:first-of-type {

margin-top: 0px;

}

ul ul, ul ol, ol ol, ol ul {

margin-bottom: 0;

}

dl {

padding: 0;

}

dl dt {

font-size: 14px;

font-weight: bold;

font-style: italic;

padding: 0;

margin: 15px 0 5px;

}

dl dt:first-child {

padding: 0;

}

dl dt>:first-child {

margin-top: 0px;

}

dl dt>:last-child {

margin-bottom: 0px;

}

dl dd {

margin: 0 0 15px;

padding: 0 15px;

}

dl dd>:first-child {

margin-top: 0px;

}

dl dd>:last-child {

margin-bottom: 0px;

}

/* CODE

=============================================================================*/

pre, code, tt {

font-size: 12px;

font-family: Consolas, "Liberation Mono", Courier, monospace;

}

code, tt {

margin: 0 0px;

padding: 0px 0px;

white-space: nowrap;

border: 1px solid #eaeaea;

background-color: #f8f8f8;

border-radius: 3px;

}

pre>code {

margin: 0;

padding: 0;

white-space: pre;

border: none;

background: transparent;

}

pre {

background-color: #f8f8f8;

border: 1px solid #ccc;

font-size: 13px;

line-height: 19px;

overflow: auto;

padding: 6px 10px;

border-radius: 3px;

}

pre code, pre tt {

background-color: transparent;

border: none;

}

kbd {

-moz-border-bottom-colors: none;

-moz-border-left-colors: none;

-moz-border-right-colors: none;

-moz-border-top-colors: none;

background-color: #DDDDDD;

background-image: linear-gradient(#F1F1F1, #DDDDDD);

background-repeat: repeat-x;

border-color: #DDDDDD #CCCCCC #CCCCCC #DDDDDD;

border-image: none;

border-radius: 2px 2px 2px 2px;

border-style: solid;

border-width: 1px;

font-family: "Helvetica Neue",Helvetica,Arial,sans-serif;

line-height: 10px;

padding: 1px 4px;

}

/* QUOTES

=============================================================================*/

blockquote {

border-left: 4px solid #DDD;

padding: 0 15px;

color: #777;

}

blockquote>:first-child {

margin-top: 0px;

}

blockquote>:last-child {

margin-bottom: 0px;

}

/* HORIZONTAL RULES

=============================================================================*/

hr {

clear: both;

margin: 15px 0;

height: 0px;

overflow: hidden;

border: none;

background: transparent;

border-bottom: 4px solid #ddd;

padding: 0;

}

/* TABLES

=============================================================================*/

table th {

font-weight: bold;

}

table th, table td {

border: 1px solid #ccc;

padding: 6px 13px;

}

table tr {

border-top: 1px solid #ccc;

background-color: #fff;

}

table tr:nth-child(2n) {

background-color: #f8f8f8;

}

/* IMAGES

=============================================================================*/

img {

max-width: 100%

}
-->

1. Load的使用

//在1.x版本中定义long数据类型会报错(用bigint代替)
create table t_load_stu(name string,age bigint)
row format
delimited fields terminated by ',';

1.1从本地导入数据,本地数据不会删除(相当于复制)

//overwrite覆盖表中所有的数据
load data local inpath 文件路径 [overwrite] into table 表名
eg:load data local inpath '/root/apps/hive-data/local_load_stu.dat'
   into table t_load_stu;

1.2在hadoop中导入数据,hadoop上的数据被删除(相当于剪切)

load data inpath hadoop上文件目录 into table tab_name
eg：load data inpath '/hivedata/load_stu.dat' into table t_load_stu;
    load data inpath '/hivedata/load_stu.dat' overwrite into table t_load_stu;

2.Insert的使用

 (1)创建源数据库
     create external table org_stu(phone string,real_name string,age int,
        gender int,country string,ip string,creat_time date,creator string)
        row format
        delimited fields terminated by ','
        location '/hivedata';
 //修改表中某个字段的类型
    alter table org_stu change gender gender string;
 //加入本地数据
     load data local inpath '/root/apps/hive-data/org_stu.dat' into table org_stu;
(2)为查询出来的数据插入创建表
   create table t_copy_stu1(phone string,real_name string,age int)
   row format
   delimited fields terminated by ',';
(3)创建分区表
   create table t_copy_partition_stu(phone string,real_name string,ip string)
   partitioned by(creat_time string)
   row format
   delimited fields terminated by ',';
（4）多重插入准备两张表
    create table t_mult_stu_1(phone string,real_name string)
    row format
    delimited fields terminated by ',';
    create table t_mult_stu_2(phone string,gender string,age int)
    row format
    delimited fields terminated by ',';

2.1将查询出来的结果插入到一个表中

 语法:insert [overwrite]/into table  表名
      select 字段 from 表名 where 条件
 eg:insert overwrite  table t_copy_stu1 select phone,real_name,age from org_stu;

2.2将查询出来的结果作为插入到表中的某个分区中(自动分区模式)

设置自动分区模式:set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict;
语法：insert overwrite table 表名  partition (分区字段名)
      select 字段 from 表名 where 条件
 eg:insert overwrite table t_copy_partition_stu partition(creat_time)
    select phone,real_name,ip,substring(creat_time,0,10) as creat_time from org_stu;

2.3多重插入

语法：from 表名
     insert into table 表名  select 字段名称 where 查询条件
     ...
eg:from org_stu
   insert into table t_mult_stu_1 select phone,real_name
   insert into table t_mult_stu_2 select phone,gender,age;

3Select

 表:
  create table t_stu_limit_20(sno int,name string,age int,sdp string)
  row format
  delimited fields terminated by ',';
 注:1、set hive.exec.reducers.bytes.per.reducer=<number>，设置每个reducer最大处理的数据大小(单位:字节)来计算出需要多少个reducer去处理,默认大小是:256000000
    2、set hive.exec.reducers.max=<number>,设置reducer的最大值,如果上面设置的算出来的reducer的个数超过max的值，以max的值为准.如果上面计算出的reducer个数小于max,以小的为准。（默认值是：1009）
    3、set mapreduce.job.reduces=<number>固定的设置reducer的个数

3.1 distribute by(字段)根据指定的字段将数据分到不同的reducer,且分发算法是hash散列算法

  注:跟reducer的个数有关，不具有排序
  eg:select * from t_stu_limit_20 distribute by sno;

3.2 sort by(字段) 不是全局排序，其在数据进入reducer前完成排序。

  注:如果用sort by 进行排序并且设置mapred.reduce.tasks>1,则sort by 只能保证每个reducer的输出有序,不保证全局有序
  eg:select * from t_stu_limit_20 distribute by sno sort by age;

3.3 order by(字段) 会对输入做全局排序

  注：只有一个reducer,缺点是当输入规模较大时需要较长的计算时间
  （Number of reduce tasks determined at compile time: 1）
  eg:select * from t_stu_limit_20 order by age;

3.4 cluster by(字段) 除了具有distribute by 功能外还可以对该字段进行排序

  注:如果分桶和sort是同一个字段，此时cluster by =distribute by +sort by
  eg:insert overwrite local directory '/root/test' select * from t_stu_limit_20 cluster by sno;

分桶的作用：最大的作用是用来提高join操作的效率

补充:如何才能在yarn的管理控制台上查看已经完成的job信息？

需要在hadoop集群上启动jobhistory服务器,执行的命令是:mr-jobhistory-daemon.sh start
historyserver

4 join

注：

1)目前hive只支持等值的join,不支持非等值的连接,因为非等值join很难转化为map/reduce任务

2)可以join多于2个表,执行流程分析

 情况一:如果join多个表时,join key 是同一个，则join会被转化为单个map/reduce任务
     eg:select a.val,b.val,c.val from a join b on(a.key=b.key1) join c
        on (c.key=b.key1)
 情况二:如果join key非同一个,则join会被转化为多个map/reduce的任务
     eg:select a.val,b.val,c.val from a join b on(a.key=b.key1) join c
        on(c.key=b.key2)
      分析:join被转化为2个map/reduce任务。因为b.key1用于第一次join条件，而b.key2用于第二次join.

3)join时，每次map/reduce的任务逻辑

  说明:reducer会缓存join序列中除了最后一个表所有表的记录,再通过最后一个表将结果序列化到文件系统中。
  优点:这一实现有助于在reduce端减少内存的使用量。
  注：在实践中应该把最大的那个表放在最后，否则会因为缓存浪费大量的缓存。
  eg:1)select a.val,b.val,c.val from a join b on(a.key=b.key1) join c
        on (c.key=b.key1)
       说明:所有的表都使用同一个join key(使用一次map/reduce任务计算)。reduce端会缓存a表和b表的记录，然后每次取得一个c表记录就计算一次join结果。
     2）select a.val,b.val,c.val from a join b on(a.key=b.key1) join c
        on(c.key=b.key2)
       说明:join key不同，这里用了2次map/reduce任务。第一次缓存a表用b表序列化，第二次缓存第一次map/reduce任务的结果，然后用c表序列化。

4)LEFT、RIGHT、FULL 、OUTER关键字用于处理join中空记录的情况

  说明:和数据库处理的差不多

5)join发生在where字句之前

  实际场景问题:select a.val,b.val from a left outer join b on(a.key=b.key) where a.ds='2016-12-30' and b.ds='2016-12-30'
  问题描述:如果a表的某条记录在b表中找不到匹配的记录，b表的所有列都会被置为null,包括ds列。这样where子句中的b.ds='2016-12-30'就会把这些没有匹配上的记录全部过滤掉，导致LEFT OUTER的效果失效(结果等同于内连接)。
  解决方案:在left outer join时把过滤条件写到on子句中
          select a.val,b.val from a left outer join b on(a.key=b.key and b.ds='2016-12-30' and a.ds='2016-12-30')
 说明:这一查询的结果是预先在join阶段过滤的，所以不会存在上述的问题。这一逻辑可以用于right 和full类型的join中。

6)join是不能交换位置的

 说明:无论是left还是right join,都是左结合的(按从左到右的顺序依次执行)。
 eg:select a.val1,a.val2,b.val,c.val from a join b on(a.key=b.key) left outer join c on(a.key=c.key)
分析:先用a表join b表，丢弃掉所有join key不匹配的记录，然后用这个中间结果和c表做join。也就是说，当一个key在a表和c表都存在，但在b表不存在的时候，整条记录在第一次join，即a join b的时候就被丢弃掉了(包括a.val1,a.val2和a.key)，所以之后和c表做left outer join时，c.val也不会出现在结果中。同理，如果这里是right outer join，就会得到这样的结果:null,null,null,c.val

5hive参数配置

5.1hive的命令行

  语法结构：
  hive [-hiveconf x=y]*[<-i filename>]*[<-f filename>|-e query-string][-S]
  说明:
   1、-i从文件中初始化hql;
   2、-e从命令行执行指定的hql
   3、-f执行hql脚本
   4、-v输出执行的hql语句到控制台
  eg：命令:hive -e 'use db2;select * from t_load_stu';
      结果:Time taken: 1.02 seconds
            OK
            zs      12
            ls      15
            wu      16

5.2参数配置方式

1）配置文件(全局有效)
说明:用户自定义配置文件：hive-site.xml
     默认配置文件:hive-default.xml
注: 1>用户自定义配置文会覆盖默认配置;
    2>hive会读取hadoop的配置，因为hive作为hadoop的客户端启动的,hive配置会覆盖hadoop的配置
 2)命令行参数（对hive启动实例有效）
   说明:-hiveconf param=value
   eg:-hiveconf hive.root.logger=info,console
 3)参数声明（对hive的连接session有效）
   说明:可以在hql中使用set关键字来设定参数
   eg:1>set hive.exec.reducers.bytes.per.reducer=>每个reduce task的平均负载数据量，hive会估算我们的总数据量，然后用总数据量除以上述参数值，就能得到需要运行的reduce task的数量
      2>set hive.exec.reducers.max=>设置reduce task的上限
      3>set mapreduce.job.reduces=>指定固定的reduce task的数量
   注:这个参数在必要时<业务逻辑决定只能用一个reduce task>会忽略的(例如order by)

6hive函数

创建测试需要的表,方便函数的测试
   1、create table dual(id string);
   2、load一个文件(一行，一个空格)到dual表中

6.1自定义hive的函数(基于java开发语言)

 步骤:第一步：先开发一个java类，继承UDF，并重载evaluate方法；
      第二步：打成jar包上传到服务器；
      第三步：将jar包添加到hive的classpath(命令:add jar jar包的全路径)；
      第四步：创建临时函数与开发好的java class关联(命令:create temporary function 方法名称 as '定义方法类名的全路径')；
 eg:
    1)创建一个java类,继承UDF
      import org.apache.commons.lang.StringUtils;
      import org.apache.hadoop.hive.ql.exec.UDF;
      /**
       * Example Hive UDF exposing three overloaded {@code evaluate} methods:
       * lower-casing a single string, summing integers, and concatenating strings.
       */
      public class MyHiveFunctionUdf extends UDF {

          /**
           * Lower-cases a single string.
           *
           * @param inStr the input string; may be null or empty
           * @return {@code inStr} unchanged when it is null or empty, otherwise its lower-case form
           */
          public String evaluate(String inStr) {
              if (StringUtils.isEmpty(inStr)) {
                  return inStr;
              }
              return inStr.toLowerCase();
          }

          /**
           * Sums the given integers, skipping null elements.
           *
           * @param sumParams the values to add up; may be null or empty
           * @return the sum of all non-null values, or 0 when there are none
           */
          public int evaluate(Integer... sumParams) {
              if (sumParams == null || sumParams.length == 0) {
                  return 0;
              }
              int totalValue = 0;
              for (Integer sumParam : sumParams) {
                  if (sumParam != null) {
                      totalValue += sumParam;
                  }
              }
              return totalValue;
          }

          /**
           * Concatenates the given strings in argument order.
           *
           * @param inStrs the strings to join; may be null or empty
           * @return the concatenation, or "" when no arguments are given;
           *         a null element is appended as the text "null"
           */
          public String evaluate(String... inStrs) {
              if (inStrs == null || inStrs.length == 0) {
                  return "";
              }
              // The builder never leaves this method, so the unsynchronized
              // StringBuilder is sufficient (StringBuffer's locking is not needed).
              StringBuilder sb = new StringBuilder();
              for (String inStr : inStrs) {
                  sb.append(inStr);
              }
              return sb.toString();
          }
      }
  2)打成jar添加到hive的classpath中去
    命令:hive> add jar /root/hive-udf.jar;
    结果:Added [/root/hive-udf.jar] to class path(加载本地的classpath)
         Added resources: [/root/hive-udf.jar](加载到distribute cache中分发到各个map/reduce中)
 3)创建临时函数myfuns与com.hive.udf.MyHiveFunctionUdf类映射
   命令:create temporary function myfuns as 'com.hive.udf.MyHiveFunctionUdf';
   结果:OK
        Time taken: 1.194 seconds
 4)在hql中使用函数
   命令:select myfuns('1','2','3') from dual;
   结果:123
   命令:select myfuns(1,2) from dual;
   结果:3

6.2Transform实现hive自定义的函数

说明:hive的transform关键字提供了在sql中调用自写脚本的功能
eg:
 1、创建容纳json数据的表
    create table t_json(json_line string)
    row format
    delimited fields terminated by '\001';
2、使用get_json_object对json表中的json字符串进行解析并保存到新的表中
   -- Build t_rating by extracting four fields from each JSON line of t_json.
   create table t_rating
   as select get_json_object(json_line,'$.movie') as movieid,
             get_json_object(json_line,'$.rate') as rate,
             get_json_object(json_line,'$.timeStamp') as timestring,
             get_json_object(json_line,'$.uid') as userid
      from t_json;
3、通过transform方式创建通过时间戳获取weekday
   python的脚本:
    #!/bin/python
    # Streaming mapper for Hive TRANSFORM.
    # Reads tab-separated rows (movieid, rating, unixtime, userid) from stdin and
    # writes the same row to stdout with unixtime replaced by its ISO weekday
    # (1 = Monday ... 7 = Sunday, evaluated in the local time zone).
    import sys
    import datetime

    for line in sys.stdin:
        line = line.strip()
        movieid, rating, unixtime, userid = line.split('\t')
        weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print('\t'.join([movieid, rating, str(weekday), userid]))
   创建一张记录表:
    CREATE TABLE u_data_new (
                  movieid INT,
                  rating INT,
                  weekday INT,
                  userid INT)
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY '\t';
  命令:add file /root/apps/hive-data/weekday-mapper.py(向每个map/reduce发送该脚本)
  执行：
    -- Pipe every t_rating row through weekday-mapper.py and store the output.
    INSERT OVERWRITE TABLE u_data_new
    SELECT
      TRANSFORM (movieid , rate, timestring,userid)
      USING 'python weekday-mapper.py'
      AS (movieid, rating, weekday,userid)
    FROM t_rating;
  校验是否成功:
    select distinct(weekday) from u_data_new limit 10;

7hive特殊分隔符处理

hive读取数据的机制：1、首先用InputFormat<默认是:org.apache.hadoop.mapred.TextInputFormat>的具体实现类读入文件数据，返回一条一条的记录(可以是行或自定义逻辑中的行)
                  2、然后用SerDe<默认是:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe>的一个具体实现类，对上面返回一条一条的记录进行字段切割

7.1通过自定义InputFormat解决特殊分隔符的问题

eg:
   1、原始数据格式:zs||24
                 ls||27
   2、重写InputFormat
        import java.io.IOException;
        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapred.FileSplit;
        import org.apache.hadoop.mapred.InputSplit;
        import org.apache.hadoop.mapred.JobConf;
        import org.apache.hadoop.mapred.LineRecordReader;
        import org.apache.hadoop.mapred.RecordReader;
        import org.apache.hadoop.mapred.Reporter;
        import org.apache.hadoop.mapred.TextInputFormat;
        import com.google.common.base.Charsets;
        /**
         * TextInputFormat whose record reader rewrites every "||" in a record to "|"
         * before the record is handed on.
         */
        public class TextInputFormatWrapper extends TextInputFormat {

            /**
             * Creates a line reader for the given split and wraps it so that each
             * record has its "||" sequences collapsed to "|".
             *
             * The record delimiter is taken from the job property
             * "textinputformat.record.delimiter"; when the property is absent,
             * null is passed to LineRecordReader as the delimiter.
             */
            @Override
            public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
                reporter.setStatus(genericSplit.toString());
                String configuredDelimiter = job.get("textinputformat.record.delimiter");
                byte[] delimiterBytes = (configuredDelimiter == null)
                        ? null
                        : configuredDelimiter.getBytes(Charsets.UTF_8);
                return new LineRecordReaderWrapper(
                        new LineRecordReader(job, (FileSplit) genericSplit, delimiterBytes));
            }

            /**
             * RecordReader that forwards every call to a wrapped LineRecordReader and
             * post-processes the value returned by next().
             */
            public static class LineRecordReaderWrapper implements RecordReader<LongWritable, Text> {

                /** The reader that actually reads from the split. */
                private final LineRecordReader delegate;

                public LineRecordReaderWrapper(LineRecordReader lineRecordReader) {
                    this.delegate = lineRecordReader;
                }

                @Override
                public void close() throws IOException {
                    delegate.close();
                }

                @Override
                public LongWritable createKey() {
                    return delegate.createKey();
                }

                @Override
                public Text createValue() {
                    return delegate.createValue();
                }

                @Override
                public long getPos() throws IOException {
                    return delegate.getPos();
                }

                @Override
                public float getProgress() throws IOException {
                    return delegate.getProgress();
                }

                /**
                 * Reads the next record from the wrapped reader; when one was read,
                 * replaces each "||" in the value with "|".
                 *
                 * @return whatever the wrapped reader's next() returned
                 */
                @Override
                public boolean next(LongWritable key, Text value) throws IOException {
                    if (!delegate.next(key, value)) {
                        return false;
                    }
                    if (value != null) {
                        value.set(value.toString().replaceAll("\\|\\|", "\\|"));
                    }
                    return true;
                }
            }
        }
 3、创建表使用该InputFormat
    create table t_bi(name string,age int)
    row format delimited fields terminated by '|'
    stored as inputformat 'com.hadoop.extend.TextInputFormatWrapper'
            outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
 4、加载数据
    load data local inpath '/root/apps/hive-data/bi.dat' into table t_bi;
 5、测试结果是
      select * from t_bi;
      zs      24
      ls      27

7.2使用RegexSerDe通过正则表达式来抽取字段

eg:
   1、 创建表,使用serde为org.apache.hadoop.hive.serde2.RegexSerDe
    create table t_bi_reg(name string,age int)
       row format
       serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
       with serdeproperties(
        'input.regex'='(.*)\\|\\|(.*)',
        'output.format.string'='%1$s %2$s'
        )
       stored as textfile;
    2、加载数据
    load data local inpath '/root/apps/hive-data/bi.dat' into table t_bi_reg;
    3、测试结果是:
        select * from t_bi_reg;
        zs      24
        ls      27
注：在数据量大的情况下上述方式比该方式效率高

练习

需求

有如下访客访问次数统计表：t-access-time

输出的报表:t-access-times-accumulate

分析

第一步:求每个用户的月访问次数
select user_name,month,sum(rating)as rating from t_access_time group by user_name,month;
结果:
A       2015-01 33
A       2015-02 10
B       2015-01 30
B       2015-02 15
第二步:将自己的月总访问次数与自己内连接
select
    A.*,B.*
from
    (select user_name,month,sum(rating)as rating from t_access_time group by user_name,month)A
        inner join
    (select user_name,month,sum(rating)as rating from t_access_time group by user_name,month)B
on A.user_name=B.user_name;
结果:
    A       2015-01 33      A       2015-01 33
    A       2015-01 33      A       2015-02 10
    A       2015-02 10      A       2015-01 33
    A       2015-02 10      A       2015-02 10
    B       2015-01 30      B       2015-01 30
    B       2015-01 30      B       2015-02 15
    B       2015-02 15      B       2015-01 30
    B       2015-02 15      B       2015-02 15
第三步:从上一步的结果中进行分组查询,分组的字段是 user_name,month,查询条件是:A.month>=B.month

实现

1、在hive中创建这样的表
    create table t_access_time(user_name string,month string,rating int)
    row format delimited fields terminated by ',';
2、加载测试数据:
    load data local inpath '/root/apps/hive-data/t_access_time.dat' into table t_access_time;
3、最终hql语句
    -- For each user and month: that month's total and the running total of
    -- all months up to and including it (self-join on user, B.month <= A.month).
    select
        A.user_name,A.month,Max(A.rating),sum(B.rating)
    from
        (select user_name,month,sum(rating)as rating from t_access_time group by user_name,month)A
            inner join
        (select user_name,month,sum(rating)as rating from t_access_time group by user_name,month)B
            on A.user_name=B.user_name
    where  A.month>=B.month
    group by A.user_name,A.month;
 4、执行结果:
    A       2015-01 33      33
    A       2015-02 10      43
    B       2015-01 30      30
    B       2015-02 15      45

hive学习笔记的更多相关文章

hive学习笔记之一：基本数据类型
欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...
hive学习笔记之三：内部表和外部表
欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...
hive学习笔记之四：分区表
欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...
hive学习笔记之五：分桶
欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...
hive学习笔记之六：HiveQL基础
欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...
hive学习笔记之七：内置函数
欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...
hive学习笔记之九：基础UDF
欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...
hive学习笔记之十：用户自定义聚合函数(UDAF)
欢迎访问我的GitHub 这里分类和汇总了欣宸的全部原创(含配套源码):https://github.com/zq2599/blog_demos 本篇概览本文是<hive学习笔记>的第十 ...
hive学习笔记之十一：UDTF
欢迎访问我的GitHub https://github.com/zq2599/blog_demos 内容:所有原创文章分类汇总及配套源码,涉及Java.Docker.Kubernetes.DevOPS ...
Hive学习笔记（一）
摘要: Hive 是建立在 Hadoop 上的数据仓库基础构架.它提供了一系列的工具,可以用来进行数据提取转化加载(ETL),这是一种可以存储.查询和分析存储在 Hadoop 中的大规模数据的机制.H ...

随机推荐

Windows无法完成安装，若要在此计算机上安装Windows，请中心启动安装。
现在安装系统已经很简单了,我觉得U盘启动的话两步就差不多了, 壹:设置BIOS,将U盘启动作为系统默认启动选项贰:直接进去大白菜之类的,一键安装... 今天终于看到第三部了, 报错:Windows无 ...
1.1、MyEclipse自定义注释
一.修改进入路径: Window->Preference->Java->Code Style->Code Template->Comments 二:编辑自定义注释文件 ...
Linux C 收藏
某招聘要求:熟悉高性能分布式网络服务端设计开发,熟悉epoll.多线程.异步IO.事件驱动等服务端技术: <UNIX环境高级编程(第3版)>apue.h等源码文件的编译安装 <UNI ...
word20161224
V.34 V.90 validation / 验证 value entry / 值项 variable / 变量 variable bit rate, VBR / 可变传输率 VBR, variabl ...
Java中的24种设计模式与7大原则
一.创建型模式 1.抽象工厂模式(Abstract factory pattern): 提供一个接口, 用于创建相关或依赖对象的家族, 而不需要指定具体类.2.生成器模式(Builder patter ...
ProgressBar---进度条
最近在处理标题进度条时,耗费了一些时间,现在总结一下ProgressBar的相关知识,有不对的地方请大神们批评指正! 进度条主要有以下三种: 1.对话框进度条 2.标题进度条注意:requestWi ...
Selenium Xpath Tutorials - Identifying xpath for element with examples to use in selenium
Xpath in selenium is close to must required. XPath is element locator and you need to provide xpath ...
412. Fizz Buzz
https://leetcode.com/problems/fizz-buzz/ 没什么好说的,上一个小学生解法 class Solution(object): def fizzBuzz(self, ...
Nginx + Tomcat Windows下的负载均衡配置
Nginx + Tomcat Windows下的负载均衡配置一.为什么需要对Tomcat服务器做负载均衡? Tomcat服务器作为一个Web服务器,其并发数在300-500之间,如果超过50 ...
python 装饰器
#!/usr/bin/env python3 #-*-encoding:utf-8-*- def w3(*args, **kwargs): ') def w1(): def ww1(func): de ...

hive学习笔记

1. Load的使用

1.1本地在本地导入数据,本地数据不会删除(相当于复制)

1.2在hadoop中导入数据,hadoop上的数据被删除(相当于剪切)

2.Insert的使用

2.1将查询出来的结果插入到一个表中

2.2将查询出来的结果作为插入到表中的某个分区中(自动分区模式)

2.3多重插入

3Select

3.1 distribute by(字段)根据指定的字段将数据分到不同的reducer,且分发算法是hash散列算法

3.2 sort by(字段) 不是全局排序，其在数据进入reducer前完成排序。

3.3 order by(字段) 会对输入做全局排序

3.4 cluster by(字段) 除了具有distribute by 功能外还可以对该字段进行排序

分桶的作用：最大的作用是用来提高join操作的效率

补充:如何才能在yarn的管理控制台上查看已经完成的job信息？

4 join

注：

1)目前hive只支持等值得join,不支持非等值的连接,因为非等值join很难转化为map/reducer任务

2)可以join多于2个表,执行流程分析

3)join时，每次map/reduce的任务逻辑

4)LEFT、RIGHT、FULL 、OUTER关键字用于处理join中空记录的情况

5)join发生在where字句之前

6)join是不能交换位置的

5hive参数配置

5.1hive的命令行

5.2参数配置方式

6hive函数

6.1自定义hive的函数(基于java开发语言)

6.2Transform实现hive自定义的函数

7hvie特殊分割符处理

7.1通过自定义InputFormat解决特殊分隔符的问题

7.2使用RegexSerDe通过正则表达式来抽取字段

练习

需求

有如下访客访问次数统计表：t-access-time

输出的报表:t-access-times-accumulate

分析

实现

hive学习笔记的更多相关文章

随机推荐

热门专题