http://52.11.56.155:7180/
http://52.11.56.155:8888/

impala-shell -i 172.31.25.244

sudo -u hdfs hdfs dfs -du -h /

http://54.149.20.119:25000/

第一步:
准备工作
生成tpc-h测试数据
在TPC-H的官网http://www.tpc.org/tpch/上下载dbgen工具http://www.tpc.org/tpch/spec/tpch_2_17_0.zip
[root@ip-172-31-25-243 training]# wget http://www.tpc.org/tpch/spec/tpch_2_17_0.zip

确认下载后的结果:
[root@ip-172-31-25-243 training]# ls
tpch_2_17_0.zip

如果没有安装解压工具,请安装解压工具
[root@ip-172-31-25-243 training]# yum install unzip
解压tpc-h数据生成工具,确认解压结果:
[root@ip-172-31-25-243 training]# unzip tpch_2_17_0.zip
[root@ip-172-31-25-243 training]# ls
__MACOSX tpch_2_17_0 tpch_2_17_0.zip

切换到相应目录并修改参数
[root@ip-172-31-25-243 training]# cd tpch_2_17_0
[root@ip-172-31-25-243 tpch_2_17_0]# ls
dbgen dev-tools ref_data
[root@ip-172-31-25-243 tpch_2_17_0]# cd dbgen
[root@ip-172-31-25-243 dbgen]# ls
answers bm_utils.c check_answers dbgen.dsp dss.ddl dsstypes.h makefile.suite PORTING.NOTES qgen.vcproj reference rnd.h shared.h text.c tpch.sln variants
bcd2.c BUGS column_split.sh dists.dss dss.h HISTORY permute.c print.c queries release.h rng64.c speed_seed.c tpcd.h tpch.vcproj varsub.c
bcd2.h build.c config.h driver.c dss.ri load_stub.c permute.h qgen.c README rnd.c rng64.h tests tpch.dsw update_release.sh

[root@ip-172-31-25-243 dbgen]# cp makefile.suite makefile
[root@ip-172-31-25-243 dbgen]# vi makefile
将其中的如下几行:
CC =
DATABASE=
MACHINE =
WORKLOAD =
修改成:
CC = gcc
DATABASE= ORACLE
MACHINE = LINUX
WORKLOAD = TPCH

构建生成工具
[root@ip-172-31-25-243 dbgen]# make
查看结果
[root@ip-172-31-25-243 dbgen]# ls
answers bm_utils.c build.o dbgen driver.o dsstypes.h makefile permute.o qgen queries rnd.c rng64.h speed_seed.o tpcd.h update_release.sh
bcd2.c bm_utils.o check_answers dbgen.dsp dss.ddl HISTORY makefile.suite PORTING.NOTES qgen.c README rnd.h rng64.o tests tpch.dsw variants
bcd2.h BUGS column_split.sh dists.dss dss.h load_stub.c permute.c print.c qgen.o reference rnd.o shared.h text.c tpch.sln varsub.c
bcd2.o build.c config.h driver.c dss.ri load_stub.o permute.h print.o qgen.vcproj release.h rng64.c speed_seed.c text.o tpch.vcproj varsub.o
发现多了两个可执行文件dbgen和qgen

切换到dbgen目录下并运行数据生成程序
[root@ip-172-31-25-243 dbgen]# ./dbgen -s 10

查看运行结果:
[root@ip-172-31-25-243 dbgen]# ls *.tbl
customer.tbl lineitem.tbl nation.tbl orders.tbl partsupp.tbl part.tbl region.tbl supplier.tbl

第二步:
将数据写入hdfs
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/customer
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/lineitem
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/nation
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/orders
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/part
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/partsupp
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/region
[root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/supplier
[root@ip-172-31-25-243 dbgen]# hdfs dfs -put customer.tbl /user/training/tpch/customer
[root@ip-172-31-25-243 dbgen]# hdfs dfs -put lineitem.tbl /user/training/tpch/lineitem
[root@ip-172-31-25-243 dbgen]# hdfs dfs -put nation.tbl /user/training/tpch/nation
[root@ip-172-31-25-243 dbgen]# hdfs dfs -put orders.tbl /user/training/tpch/orders
[root@ip-172-31-25-243 dbgen]# hdfs dfs -put part.tbl /user/training/tpch/part
[root@ip-172-31-25-243 dbgen]# hdfs dfs -put partsupp.tbl /user/training/tpch/partsupp
[root@ip-172-31-25-243 dbgen]# hdfs dfs -put region.tbl /user/training/tpch/region
[root@ip-172-31-25-243 dbgen]# hdfs dfs -put supplier.tbl /user/training/tpch/supplier
大家可以看一下数据大小以及数据的分布情况,每个表的大小

第三步:
进入hue界面
http://52.11.56.155:8888/
用户名admin,密码admin
进入Query Editors -> Hive
同时打开另一个标签页,进入Query Editors -> Impala

创建数据库和表
--创建数据库
在hive标签页中输入
create database training

在界面左边的DataBase部分,点击刷新之后就可以看到新建的数据库training
选中该数据库,然后在右边的编辑窗口中输入以下建表语句:

在hive标签页执行
-- create tables and load data
create external table customer (C_CUSTKEY INT, C_NAME STRING, C_ADDRESS STRING, C_NATIONKEY INT, C_PHONE STRING, C_ACCTBAL DOUBLE, C_MKTSEGMENT STRING, C_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/customer';
create external table lineitem (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/lineitem';
create external table nation (N_NATIONKEY INT, N_NAME STRING, N_REGIONKEY INT, N_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/nation';
create external table orders (O_ORDERKEY INT, O_CUSTKEY INT, O_ORDERSTATUS STRING, O_TOTALPRICE DOUBLE, O_ORDERDATE STRING, O_ORDERPRIORITY STRING, O_CLERK STRING, O_SHIPPRIORITY INT, O_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/orders';
create external table part (P_PARTKEY INT, P_NAME STRING, P_MFGR STRING, P_BRAND STRING, P_TYPE STRING, P_SIZE INT, P_CONTAINER STRING, P_RETAILPRICE DOUBLE, P_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/part';
create external table partsupp (PS_PARTKEY INT, PS_SUPPKEY INT, PS_AVAILQTY INT, PS_SUPPLYCOST DOUBLE, PS_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION'/user/training/tpch/partsupp';
create external table region (R_REGIONKEY INT, R_NAME STRING, R_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/region';
create external table supplier (S_SUPPKEY INT, S_NAME STRING, S_ADDRESS STRING, S_NATIONKEY INT, S_PHONE STRING, S_ACCTBAL DOUBLE, S_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/supplier';

-- create orc tables;
create table customer_orc (C_CUSTKEY INT, C_NAME STRING, C_ADDRESS STRING, C_NATIONKEY INT, C_PHONE STRING, C_ACCTBAL DOUBLE, C_MKTSEGMENT STRING, C_COMMENT STRING) STORED AS orc;
create table lineitem_orc (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) STORED AS orc;
create table nation_orc (N_NATIONKEY INT, N_NAME STRING, N_REGIONKEY INT, N_COMMENT STRING) STORED AS orc;
create table orders_orc (O_ORDERKEY INT, O_CUSTKEY INT, O_ORDERSTATUS STRING, O_TOTALPRICE DOUBLE, O_ORDERDATE STRING, O_ORDERPRIORITY STRING, O_CLERK STRING, O_SHIPPRIORITY INT, O_COMMENT STRING) STORED AS orc;
create table part_orc (P_PARTKEY INT, P_NAME STRING, P_MFGR STRING, P_BRAND STRING, P_TYPE STRING, P_SIZE INT, P_CONTAINER STRING, P_RETAILPRICE DOUBLE, P_COMMENT STRING) STORED AS orc;
create table partsupp_orc (PS_PARTKEY INT, PS_SUPPKEY INT, PS_AVAILQTY INT, PS_SUPPLYCOST DOUBLE, PS_COMMENT STRING) STORED AS orc;
create table region_orc (R_REGIONKEY INT, R_NAME STRING, R_COMMENT STRING) STORED AS orc;
create table supplier_orc (S_SUPPKEY INT, S_NAME STRING, S_ADDRESS STRING, S_NATIONKEY INT, S_PHONE STRING, S_ACCTBAL DOUBLE, S_COMMENT STRING) STORED AS orc;

在Impala标签页执行

invalidate metadata

然后点击屏幕左边的刷新按钮,可以看到刚才在hive中建立的database training.
-- create parquet tables
create table customer_parquet (C_CUSTKEY INT, C_NAME STRING, C_ADDRESS STRING, C_NATIONKEY INT, C_PHONE STRING, C_ACCTBAL DOUBLE, C_MKTSEGMENT STRING, C_COMMENT STRING) STORED AS parquet;
create table lineitem_parquet (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) STORED AS parquet;
create table nation_parquet (N_NATIONKEY INT, N_NAME STRING, N_REGIONKEY INT, N_COMMENT STRING) STORED AS parquet;
create table orders_parquet (O_ORDERKEY INT, O_CUSTKEY INT, O_ORDERSTATUS STRING, O_TOTALPRICE DOUBLE, O_ORDERDATE STRING, O_ORDERPRIORITY STRING, O_CLERK STRING, O_SHIPPRIORITY INT, O_COMMENT STRING) STORED AS parquet;
create table part_parquet (P_PARTKEY INT, P_NAME STRING, P_MFGR STRING, P_BRAND STRING, P_TYPE STRING, P_SIZE INT, P_CONTAINER STRING, P_RETAILPRICE DOUBLE, P_COMMENT STRING) STORED AS parquet;
create table partsupp_parquet (PS_PARTKEY INT, PS_SUPPKEY INT, PS_AVAILQTY INT, PS_SUPPLYCOST DOUBLE, PS_COMMENT STRING) STORED AS parquet;
create table region_parquet (R_REGIONKEY INT, R_NAME STRING, R_COMMENT STRING) STORED AS parquet;
create table supplier_parquet (S_SUPPKEY INT, S_NAME STRING, S_ADDRESS STRING, S_NATIONKEY INT, S_PHONE STRING, S_ACCTBAL DOUBLE, S_COMMENT STRING) STORED AS parquet;

-- hive: load data to orc format
insert overwrite table lineitem_orc select * from lineitem;
insert overwrite table customer_orc select * from customer;
insert overwrite table nation_orc select * from nation;
insert overwrite table orders_orc select * from orders;
insert overwrite table part_orc select * from part;
insert overwrite table partsupp_orc select * from partsupp;
insert overwrite table region_orc select * from region;
insert overwrite table supplier_orc select * from supplier;

-- impala: load data to parquet format
insert overwrite table lineitem_parquet select * from lineitem;
insert overwrite table customer_parquet select * from customer;
insert overwrite table nation_parquet select * from nation;
insert overwrite table orders_parquet select * from orders;
insert overwrite table part_parquet select * from part;
insert overwrite table partsupp_parquet select * from partsupp;
insert overwrite table region_parquet select * from region;
insert overwrite table supplier_parquet select * from supplier;

-- impala: compute stats
compute stats customer;
compute stats lineitem;
compute stats nation;
compute stats orders;
compute stats part;
compute stats partsupp;
compute stats region;
compute stats supplier;

compute stats customer_parquet;
compute stats lineitem_parquet;
compute stats nation_parquet;
compute stats orders_parquet;
compute stats part_parquet;
compute stats partsupp_parquet;
compute stats region_parquet;
compute stats supplier_parquet;

第四步:
--Query1:单表统计分析

--hive

-- hive+text
SELECT
L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), COUNT(1)
FROM
lineitem
WHERE
L_SHIPDATE<='1998-09-02'
GROUP BY L_RETURNFLAG, L_LINESTATUS
ORDER BY L_RETURNFLAG, L_LINESTATUS;

-- hive+orc
SELECT
L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), COUNT(1)
FROM
lineitem_orc
WHERE
L_SHIPDATE<='1998-09-02'
GROUP BY L_RETURNFLAG, L_LINESTATUS
ORDER BY L_RETURNFLAG, L_LINESTATUS;

-- impala+text
SELECT
L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), COUNT(1)
FROM
lineitem
WHERE
L_SHIPDATE<='1998-09-02'
GROUP BY L_RETURNFLAG, L_LINESTATUS
ORDER BY L_RETURNFLAG, L_LINESTATUS;

--impala+parquet
SELECT
L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), cast(COUNT(1) as int)
FROM
lineitem_parquet
WHERE
L_SHIPDATE<='1998-09-02'
GROUP BY L_RETURNFLAG, L_LINESTATUS
ORDER BY L_RETURNFLAG, L_LINESTATUS;

第五步:
--Query2:多表联合查询
--hive+text
select
l_orderkey, sum(l_extendedprice*(1-l_discount)) as revenue, o_orderdate, o_shippriority
from
customer c join orders o
on c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey
join lineitem l
on l.l_orderkey = o.o_orderkey
where
o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15'
group by l_orderkey, o_orderdate, o_shippriority
order by revenue desc, o_orderdate
limit 10;

--hive+orc
select
l_orderkey, sum(l_extendedprice*(1-l_discount)) as revenue, o_orderdate, o_shippriority
from
customer_orc c join orders_orc o
on c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey
join lineitem_orc l
on l.l_orderkey = o.o_orderkey
where
o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15'
group by l_orderkey, o_orderdate, o_shippriority
order by revenue desc, o_orderdate
limit 10;

--impala+text
select
l_orderkey, sum(l_extendedprice*(1-l_discount)) as revenue, o_orderdate, o_shippriority
from
customer c join orders o
on c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey
join lineitem l
on l.l_orderkey = o.o_orderkey
where
o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15'
group by l_orderkey, o_orderdate, o_shippriority
order by revenue desc, o_orderdate
limit 10;

--impala+parquet
select
l_orderkey, sum(l_extendedprice*(1-l_discount)) as revenue, o_orderdate, o_shippriority
from
customer_parquet c join orders_parquet o
on c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey
join lineitem_parquet l
on l.l_orderkey = o.o_orderkey
where
o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15'
group by l_orderkey, o_orderdate, o_shippriority
order by revenue desc, o_orderdate
limit 10;

--hdfs
数据的块分布

--hive演示要点:
1. Hive的Shell执行方式和Hue界面对比
2. Hive建表语句,Hive查询,统计分析,多表关联查询和统计
3. Hive的运行原理
4. Hive的SQL执行速度问题
5. 系统资源使用情况
6. 并发访问的问题,资源使用

--impala演示
1. impala shell与Hue
2. impala与hive执行速度的对比
3. orc,parquet的压缩比
4. parquet的运行效率提升
5. 系统资源使用情况
6. impala的运行原理
7. impala运行时统计信息,impala的诊断和调优

CIB Training Scripts For TPC-H Benchmark的更多相关文章

  1. 【error】scripts/basic/fixdep: Syntax error: "(" unexpected

    前言 第一次安装PCIE驱动的时候容易出现各种问题,总结一下下.. 原因分析 一般情况下,直接make的时候会出现问题. scripts/basic/fixdep: : scripts/basic/f ...

  2. [io benchmark]常用磁盘基准/压力测试工具

    Unix Disk I/O Benchmarks fio - NEW! fio is an I/O tool meant to be used both for benchmark and stres ...

  3. faceswap使用手冊

    cd faceswap cd faceswap-master python faceswap.py gui Notice: This repository is not operated or mai ...

  4. 探究MySQL MGR的读写分离

    1:现有环境 机器 MySQL 环境 172.16.128.240 MGR NODE1 MGR 172.16.128.241 MGR NODE2 MGR 172.16.128.242 MGR NODE ...

  5. mysql性能测试-tpcc

    mysql性能测试-tpcc Tpcc-mysql TPC-C是专门针对联机交易处理系统(OLTP系统)的规范 Tpcc-mysql由percona根据规范实现 TPCC流程  更能模拟线上业务   ...

  6. bert 硬件要求

    https://github.com/google-research/bert BERT ***** New May 31st, 2019: Whole Word Masking Models *** ...

  7. [转]Dynamic SQL & Stored Procedure Usage in T-SQL

    转自:http://www.sqlusa.com/bestpractices/training/scripts/dynamicsql/ Dynamic SQL & Stored Procedu ...

  8. (转)The AlphaGo Replication Wiki

    The AlphaGo Replication Wiki 摘自:https://github.com/Rochester-NRT/RocAlphaGo/wiki/01.-Home Contents : ...

  9. [HIve - LanguageManual] Join Optimization (不懂)

    Join Optimization Join Optimization Improvements to the Hive Optimizer Star Join Optimization Star S ...

随机推荐

  1. hihoCoder 1432 : JiLi Number(吉利数)

    hihoCoder #1432 : JiLi Number(吉利数) 时间限制:1000ms 单点时限:1000ms 内存限制:256MB Description - 题目描述 Driver Ji l ...

  2. Jenkins console输出乱码???

    jenkins console输出乱码??? 在jenkins中job执行shell,console中出现乱码如下: [WARNING] /home/mtime/data/jenkins/worksp ...

  3. python走起之第十三话

    前景介绍 到目前为止,很多公司对堡垒机依然不太感冒,其实是没有充分认识到堡垒机在IT管理中的重要作用的,很多人觉得,堡垒机就是跳板机,其实这个认识是不全面的,跳板功能只是堡垒机所具备的功能属性中的其中 ...

  4. Scrum Meeting 12-20151218

    任务安排 姓名 今日任务 明日任务 困难 董元财 无 网络连接框架优化 无 胡亚坤 无 修复商品列表排列混乱) 无 刘猛 无 无 马汉虎 无 无 赖彦俞 无 无 燃尽图 团队照片 代码签入记录 今天开 ...

  5. WPF感悟

    WPF感悟 UI层与逻辑层要尽可能地剥离(解耦). Routed Event和Command比Event的耦合度要低. UI层与逻辑层的“血管”是数据关联(Data Binding). 尽量不要试图通 ...

  6. 转:最近5年133个Java面试问题列表

    最近5年133个Java面试问题列表 Java 面试随着时间的改变而改变.在过去的日子里,当你知道 String 和 StringBuilder 的区别就能让你直接进入第二轮面试,但是现在问题变得越来 ...

  7. BZOJ4247挂饰

    Description     JOI君有N个装在手机上的挂饰,编号为1...N. JOI君可以将其中的一些装在手机上.     JOI君的挂饰有一些与众不同--其中的一些挂饰附有可以挂其他挂件的挂钩 ...

  8. union内嵌struct用法

      // union内嵌struct用法 // 众所周知,union为联合体,struct为结构体.下面根据实例谈谈用法   #include <stdio.h>   #include & ...

  9. WORD 粘贴代码 不检查语法

  10. java, mybatis, 调用mysql存储过程

    Map<String, Object> bindinfo = new HashMap<String, Object>();            bindinfo.put(&q ...