hive不分区增量更新

-- Rebuilds ods.zeg_so from a full join of the staging table (ods.zeg_so_mid)
-- with the current contents of ods.zeg_so, matched on id.
insert overwrite table ods.zeg_so
select
-- NOTE(review): `*` over a join expands to the columns of BOTH zsm and zs, so
-- the select list is wider than a single table's row. Presumably this should be
-- an explicit per-column "staging value if present, else existing value" list;
-- the ods.zeg_so schema is not shown here, so confirm before running.
*,
-- Rows present in the staging table get the current timestamp (cast to string);
-- rows that exist only in the target keep their previous etl_update.
case when zsm.id is not null then cast(current_timestamp as string) else zs.etl_update end etl_update
from ods.zeg_so_mid zsm
full join ods.zeg_so zs on zsm.id=zs.id
----------------------------------------------------------
-- Incremental merge of a non-partitioned table using a FULL JOIN on id.
-- For every id the staging row wins when one exists; otherwise the existing
-- row is carried over unchanged. Selecting only tmp.* here would turn every
-- row that exists solely in the target into an all-NULL row.
-- Assumes id is a non-null key in both tables (a NULL id never matches in the
-- join and cannot be attributed to either side) -- TODO confirm.
insert overwrite table data_center.test_no_partition
select
    coalesce(tmp.id, org.id) as id,
    case when tmp.id is not null then tmp.name else org.name end as name,
    case when tmp.id is not null then tmp.age else org.age end as age
from tmp.temp_test_no_partition tmp
full join data_center.test_no_partition org
    on tmp.id = org.id;

#hive全连接

-- Experiment: all staging rows, followed by every existing row.
-- The left join carries no "stg.id is null" filter, so existing rows whose id
-- also appears in the staging table are written again next to the new version
-- (once per matching staging row).
insert overwrite table data_center.test_no_partition
select
    tmp.id,
    tmp.name,
    tmp.age
from tmp.temp_test_no_partition as tmp
union all
select
    org.id,
    org.name,
    org.age
from data_center.test_no_partition as org
left join tmp.temp_test_no_partition as stg
    on org.id = stg.id
--------------------------------------------------------------
-- Experiment: all staging rows, followed by the staging-side columns of
-- "existing left join staging".
-- Existing rows with a match repeat the staging values; existing rows without
-- a match contribute a row of NULLs because only stg.* is projected.
insert overwrite table data_center.test_no_partition
select
    tmp.id,
    tmp.name,
    tmp.age
from tmp.temp_test_no_partition as tmp
union all
select
    stg.id,
    stg.name,
    stg.age
from data_center.test_no_partition as org
left join tmp.temp_test_no_partition as stg
    on org.id = stg.id
----------------------------------------------------------------
-- Experiment: full join, projecting only the staging side.
-- Ids that exist only in the target table produce rows of NULLs.
insert overwrite table data_center.test_no_partition
select
    tmp.id,
    tmp.name,
    tmp.age
from tmp.temp_test_no_partition as tmp
full outer join data_center.test_no_partition as org
    on tmp.id = org.id
--------------------------------------------------------------------
-- Experiment: full join, projecting only the existing-table side.
-- Matched ids keep their old values; ids that exist only in the staging table
-- produce rows of NULLs.
insert overwrite table data_center.test_no_partition
select
    org.id,
    org.name,
    org.age
from tmp.temp_test_no_partition as tmp
full outer join data_center.test_no_partition as org
    on tmp.id = org.id
--------------------------------------------------------------------------------
不分区增量更新 OK
-- Incremental update of a non-partitioned table:
--   1) take every row from the staging table, then
--   2) append the existing rows whose id has no counterpart in the staging
--      table (left join + "stg.id is null" acts as an anti-join).
insert overwrite table data_center.test_no_partition
select
    tmp.id,
    tmp.name,
    tmp.age
from tmp.temp_test_no_partition as tmp
union all
select
    org.id,
    org.name,
    org.age
from data_center.test_no_partition as org
left join tmp.temp_test_no_partition as stg
    on org.id = stg.id
where stg.id is null;

分区：增量更新
INSERT OVERWRITE TABLE %s.%s PARTITION(%s) select %s from %s''' % (job_info_map["w_database"], job_info_map["w_table"], job_info_map["w_partition_name"],
colums_str, "temp." + job_info_map["temp_table"])

# Session settings emitted ahead of the statement: non-strict dynamic
# partitioning, a per-node dynamic-partition limit of 1000, and the reduce-side
# shuffle input buffer fraction.
hive_sql = "set hive.exec.dynamic.partition.mode=nonstrict;"
hive_sql += "set hive.exec.max.dynamic.partitions.pernode=1000;"
hive_sql += "set mapreduce.reduce.shuffle.input.buffer.percent=0.5;"
# INSERT OVERWRITE into the partitioned target: every row of the temp table,
# UNION ALL the target's own rows left-joined to the temp table. The join
# condition (and anything after it) is appended after " b on " and is not part
# of this excerpt.
# NOTE(review): table, partition and column names are concatenated straight
# into the SQL text -- confirm job_info_map, hive_table, temp_table and
# colums_str never carry untrusted input.
hive_sql += "INSERT OVERWRITE TABLE " + hive_table + " PARTITION(" + job_info_map["w_partition_name"] + ")" + "\nSELECT " + colums_str + " FROM " + temp_table + " UNION ALL SELECT a.* FROM " + hive_table + " a LEFT OUTER JOIN " + temp_table + " b on "

#增量取数据条件
update_time>=subdate(current_date,1)
COALESCE(update_time,create_Time)>=subdate(current_date,1)

-------------------------------------------------------------------
#分区增量更新语句：
-- Incremental update of one static partition (date_id='2017-06-13').
-- Because the partition value is fixed in the PARTITION clause, the select
-- list carries only the three non-partition columns.
-- The carried-over branch is limited to the partition being overwritten;
-- without that filter, unmatched rows from every other partition would be
-- copied into date_id='2017-06-13' as well.
insert overwrite table data_center.test_partition partition(date_id='2017-06-13')
select tmp.id, tmp.name, tmp.age
from tmp.temp_test_partition tmp
union all
select org.id, org.name, org.age
from data_center.test_partition org
left outer join tmp.temp_test_partition b
    on org.id = b.id
where b.id is null
  and org.date_id = '2017-06-13';
报错：SemanticException [Error 10044]: Line 1:23 Cannot insert into target table because column number/types are different ''2017-06-13'': Table insclause-0 has 3 columns, but query has 4 columns

-- Staging table for the partitioned incremental-update examples:
-- comma-delimited text with three columns (id, name, age).
-- The class names and the LOCATION path are single string literals; they must
-- not be split across lines.
CREATE TABLE `tmp.temp_test_partition`(
  `id` int,
  `name` string,
  `age` int)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ( 'field.delim'=',', 'serialization.format'=',')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://master:9000/user/hive/warehouse/tmp.db/temp_test_partition'
TBLPROPERTIES ( 'transient_lastDdlTime'='1497492633');

-- Sets both the field delimiter and the serialization format of the staging
-- table's SerDe to a comma.
alter table tmp.temp_test_partition
set serdeproperties (
    'field.delim' = ',',
    'serialization.format' = ','
)

#分区增量更新语句：OK
-- Incremental update of a partitioned table using dynamic partitioning.
-- Non-strict mode is required because no partition value is given statically.
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
-- Branch 1: every staging row is written to partition '2017-06-15'.
-- Branch 2: existing rows from the three listed partitions whose id is absent
--           from the staging table are written back to their own partition.
-- Columns are listed explicitly (instead of a.*) so the insert keeps lining up
-- with the first branch if the table's column list changes; the partition
-- column must stay last for dynamic partitioning.
-- NOTE(review): a listed partition whose rows ALL match staging ids receives no
-- rows here; verify whether its old contents are actually replaced in that case.
insert overwrite table data_center.test_partition partition(date_id)
select tmp.id, tmp.name, tmp.age, '2017-06-15' as date_id
from tmp.temp_test_partition tmp
union all
select a.id, a.name, a.age, a.date_id
from data_center.test_partition a
left outer join tmp.temp_test_partition b
    on a.id = b.id
where b.id is null
  and a.date_id in ('2017-06-14','2017-06-15','2017-06-16');

#删除部分数据
-- "Delete" by rewriting the table from itself, keeping only two rows.
-- There is no ORDER BY, so the query does not determine which two rows are kept.
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
insert overwrite table data_center.test_partition partition (date_id)
select
    id,
    name,
    age,
    date_id
from data_center.test_partition
limit 2;

-- "Delete" by rewriting the table from itself, keeping only the rows whose
-- name is 'lisi' or 'ccc'.
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
insert overwrite table data_center.test_partition partition (date_id)
select
    id,
    name,
    age,
    date_id
from data_center.test_partition
where name in ('lisi', 'ccc')

hive不分区增量更新的更多相关文章

使用hive增量更新
目录 1.增量更新 2.对第一种情况 2.1.准备工作 2.2.更新数据 3.对第二种情况 3.1.准备工作 3.2.方法1 3.3.方法2 参考文末文章,加上自己的理解. 1.增量更新有一个 ba ...
数仓增量更新hive实现
注:参考文末文章,加上自己的理解. 1.增量更新有一个 base_table 表存放的是 12 月 15 日之前的所有数据,当 12 月 16 日的数据产生后,生成了一个 incremental_t ...
大数据系列之数据仓库Hive中分区Partition如何使用
Hive系列博文,持续更新~~~ 大数据系列之数据仓库Hive原理大数据系列之数据仓库Hive安装大数据系列之数据仓库Hive中分区Partition如何使用大数据系列之数据仓库Hive命令使用 ...
hive表分区相关操作
Hive 表分区 Hive表的分区就是一个目录,分区字段不和表的字段重复创建分区表: create table tb_partition(id string, name string) PARTIT ...
谈谈混合 App Web 资源的打包与增量更新
综述移动 App 的运行环境具有带宽不稳定,流量收费,启动速度比较重要等特点,所以混合 App 如何加载 Web 资源并不是一个新问题.本文目的是总结出一种资源打包下载的思路和方案,并且提供一种打包 ...
SSIS Design2：增量更新
一般来说,ETL实现增量更新的方式有两种,第一种:记录字段的最大值,如果数据源中存在持续增加的数据列,记录上次处理的数据集中,该列的最大值:第二种是,保存HashValue,快速检查所有数据,发现异动 ...
android studio增量更新
一.概述 1.1 概念增量更新即是通过比较本机安装版本和想要安装版本间的差异,产生一个差异安装包,不需要从官网下载并安装全量安装包,更不需要将本机已安装的版本下载,而仅仅只是安装此差异安装包 ...
Android 增量更新（BSDiff / bspatch）
Android 增量更新 BSDiff / bspatchhttp://www.daemonology.net/bsdiff/android的代码目录下 \external\bsdiff bsdiff ...
【转载】Unity 合理安排增量更新（热更新）
原帖地址:由于我看到的那个网站发的这篇帖子很大可能是盗贴的,我就暂时不贴地址了.避免伤害原作者原版写的有点乱,我个人修改整理了下. --------------------------------- ...

随机推荐

JavaScript高程第三版笔记-函数表达式
1⃣️递归阶乘函数: function factorial(num){ ){ ; } ); } } 改装一:(arguments.callee指向正在执行的函数的指针,实现解耦) function ...
使用PowerShell 自动创建DFS复制组
运行环境:Windows Server 2012 R2 DFS 复制概述 DFS复制组 PowerShell脚本命令需要注意的是DFS依赖域,若此服务器未存在于域控上,或未存在域内,则此脚本会报错 ...
python 爬虫基于requests模块的get请求
需求:爬取搜狗首页的页面数据 import requests # 1.指定url url = 'https://www.sogou.com/' # 2.发起get请求:get方法会返回请求成功的响应对 ...
【BZOJ4668】冷战（并查集）
Description 1946 年 3 月 5 日,英国前首相温斯顿·丘吉尔在美国富尔顿发表"铁幕演说",正式拉开了冷战序幕.美国和苏联同为世界上的"超级大国" ...
2019牛客暑期多校训练营（第一场）-E（DP）
题目链接:https://ac.nowcoder.com/acm/contest/881/E 题意:求可分解成n个AB和m个BA的字符串的个数. 思路: 首先根据贪心思想,前n个A可作为AB的A,后m ...
制作U盘的win7系统安装
方法一用iso.需要下载个UltraISO软件安装. 制作64位WIN7系统U盘安装盘方法首页就有iso下载,有雨林木风等,我下载了系统之家最新的1907 U盘安装win7系统BIOS设置 thi ...
自然语言处理工具hanlp 1.7.3版本更新内容一览
HanLP 1.7.3 发布了.HanLP 是由一系列模型与算法组成的 Java 工具包,目标是普及自然语言处理在生产环境中的应用.HanLP 具备功能完善.性能高效.架构清晰.语料时新.可自定义的特 ...
Struts 2访问Servlet API
在servlet中可以通过servlet API来获取Session,在Struts中如何获取Session呢? 解析:将用户名放入session 两种方案 1. 与Servlet API解耦的访问方 ...
axios模块封装和分类列表实现
这个作用主要还是为了让代码更加的,清晰. 不要全部都放到 created(){} 这个方法下面.把这些代码全部抽离出去. 这里就只是去调用方法.1. src 目录下,新建文件夹--- rest ...
typescript中新增的基本数据类型
javascript中有7种数据类型,分别是:boolean,number,string,null,undefined和object,以及在es6中新增的一种类型 symbol.而typescript ...

hive不分区增量更新

hive不分区增量更新的更多相关文章

随机推荐

热门专题