HiBench Learning Notes (8): Analyzing the Source of workload_functions.sh
workload_functions.sh is the entry point of every benchmark run: it glues the monitoring program monitor.py to the main workload runner. The full script follows; a short usage sketch comes after the listing:
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -u

export HIBENCH_PRINTFULLLOG=0
this="${BASH_SOURCE-$0}"
workload_func_bin=$(cd -P -- "$(dirname -- "$this")" && pwd -P)
. ${workload_func_bin}/assert.sh
. ${workload_func_bin}/color.sh

HIBENCH_CONF_FOLDER=${HIBENCH_CONF_FOLDER:-${workload_func_bin}/../../conf}

function enter_bench(){         # declare the entrance of a workload
    assert $1 "Workload name not specified."
    assert $2 "Workload config file not specified."
    assert $3 "Current workload folder not specified."
    export HIBENCH_CUR_WORKLOAD_NAME=$1
    workload_config_file=$2
    workload_folder=$3
    shift 3
    patching_args=$@
    echo "patching args=$patching_args"
    local CONF_FILE=`${workload_func_bin}/load_config.py ${HIBENCH_CONF_FOLDER} $workload_config_file $workload_folder $patching_args`
    . $CONF_FILE
}

function leave_bench(){         # declare the workload is finished
    assert $HIBENCH_CUR_WORKLOAD_NAME "BUG, HIBENCH_CUR_WORKLOAD_NAME unset."
    unset HIBENCH_CUR_WORKLOAD_NAME
}

function show_bannar(){         # print banner
    assert $HIBENCH_CUR_WORKLOAD_NAME "HIBENCH_CUR_WORKLOAD_NAME not specified."
    assert $1 "Unknown banner operation"
    echo -e "${BGreen}$1 ${Color_Off}${UGreen}$HIBENCH_CUR_WORKLOAD_NAME${Color_Off} ${BGreen}bench${Color_Off}"
}

function timestamp(){           # get current timestamp in milliseconds
    sec=`date +%s`
    nanosec=`date +%N`

    re='^[0-9]+$'
    if ! [[ $nanosec =~ $re ]] ; then
        nanosec=0
    fi

    tmp=`expr $sec \* 1000`
    msec=`expr $nanosec / 1000000`
    echo `expr $tmp + $msec`
}

function start_monitor(){
    MONITOR_PID=`${workload_func_bin}/monitor.py ${HIBENCH_CUR_WORKLOAD_NAME} $$ ${WORKLOAD_RESULT_FOLDER}/monitor.log ${WORKLOAD_RESULT_FOLDER}/bench.log ${WORKLOAD_RESULT_FOLDER}/monitor.html ${SLAVES} &`
#    echo "start monitor, got child pid:${MONITOR_PID}" > /dev/stderr
    echo ${MONITOR_PID}
}

function stop_monitor(){
    MONITOR_PID=$1
    assert $1 "monitor pid missing"
#    echo "stop monitor, kill ${MONITOR_PID}" > /dev/stderr
    kill ${MONITOR_PID}
}

function get_field_name() {     # print report column header
    printf "${REPORT_COLUMN_FORMATS}" Type Date Time Input_data_size "Duration(s)" "Throughput(bytes/s)" Throughput/node
}

function gen_report() {         # dump the result to report file
    assert ${HIBENCH_CUR_WORKLOAD_NAME} "HIBENCH_CUR_WORKLOAD_NAME not specified."
    local start=$1
    local end=$2
    local size=$3
    which bc > /dev/null 2>&1
    if [ $? -ne 0 ]; then
        assert 0 "\"bc\" utility missing. Please install it to generate proper report."
        return 1
    fi
    local duration=$(echo "scale=3;($end-$start)/1000"|bc)
    local tput=`echo "$size/$duration"|bc`
#    local nodes=`... >/dev/null | grep -v '^\s*$' | sed "/^#/ d" | wc -l`    # disabled legacy way of counting slave nodes
    local nodes=`echo ${SLAVES} | wc -w`
    nodes=${nodes:-1}
    if [ $nodes -eq 0 ]; then nodes=1; fi
    local tput_node=`echo "$tput/$nodes"|bc`

    REPORT_TITLE=`get_field_name`
    if [ ! -f ${HIBENCH_REPORT}/${HIBENCH_REPORT_NAME} ] ; then
        echo "${REPORT_TITLE}" > ${HIBENCH_REPORT}/${HIBENCH_REPORT_NAME}
    fi

    REPORT_LINE=$(printf "${REPORT_COLUMN_FORMATS}" ${HIBENCH_CUR_WORKLOAD_NAME} $(date +%F) $(date +%T) $size $duration $tput $tput_node)
    echo "${REPORT_LINE}" >> ${HIBENCH_REPORT}/${HIBENCH_REPORT_NAME}
    echo "# ${REPORT_TITLE}" >> ${HIBENCH_WORKLOAD_CONF}
    echo "# ${REPORT_LINE}" >> ${HIBENCH_WORKLOAD_CONF}
}

function rmr_hdfs(){            # rm -r for hdfs
    assert $1 "dir parameter missing"
    RMDIR_CMD="fs -rm -r -skipTrash"
    local CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR $RMDIR_CMD $1"
    echo -e "${BCyan}hdfs rm -r: ${Cyan}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}
}

function upload_to_hdfs(){
    assert $1 "local parameter missing"
    assert $2 "remote parameter missing"
    LOCAL_FILE_PATH=$1
    REMOTE_FILE_PATH=$2
    echo "REMOTE_FILE_PATH:$REMOTE_FILE_PATH" 1>&2
    if [[ `echo $REMOTE_FILE_PATH | tr A-Z a-z` = hdfs://* ]]; then # strip leading "HDFS://xxx:xxx/" string
        echo "HDFS_MASTER:$HDFS_MASTER" 1>&2
        local LEADING_HDFS_STRING_LENGTH=${#HDFS_MASTER}
        REMOTE_FILE_PATH=${REMOTE_FILE_PATH:$LEADING_HDFS_STRING_LENGTH}
        echo "stripped REMOTE_FILE_PATH:$REMOTE_FILE_PATH" 1>&2
    fi

    # clear previous package file
    local CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR fs -rm $REMOTE_FILE_PATH"
    echo -e "${BCyan}hdfs rm: ${Cyan}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}

    # prepare parent folder
    CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR fs -mkdir `dirname $REMOTE_FILE_PATH`"
    echo -e "${BCyan}hdfs mkdir: ${Cyan}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}

    # upload
    CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR fs -put $LOCAL_FILE_PATH $REMOTE_FILE_PATH"
    echo -e "${BCyan}hdfs put: ${Cyan}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}
}

function dus_hdfs(){            # du -s for hdfs
    assert $1 "dir parameter missing"
    DUS_CMD="fs -du -s"
    local CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR $DUS_CMD $1"
    echo -e "${BPurple}hdfs du -s: ${Purple}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}
}

function check_dir() {          # ensure dir is created
    local dir=$1
    assert $1 "dir parameter missing"
    if [ -z "$dir" ];then
        echo -e "${BYellow}WARN${Color_Off}: payload missing."
        return
    fi
    if [ ! -d "$dir" ];then
        echo -e "${BRed}ERROR${Color_Off}: directory $dir does not exist."
        exit 1
    fi
    touch "$dir"/touchtest
    if [ $? -ne 0 ]; then
        echo -e "${BRed}ERROR${Color_Off}: directory unwritable."
        exit 1
    else
        rm "$dir"/touchtest
    fi
}

function dir_size() {           # sum up the size reported by dus_hdfs
    for item in $(dus_hdfs $1); do
        if [[ $item =~ ^[0-9]+$ ]]; then
            echo $item
        fi
    done
}

function run_spark_job() {
    LIB_JARS=
    while (($#)); do
        if [ "$1" = "--jars" ]; then
            LIB_JARS="--jars $2"
            shift 2
            continue
        fi
        break
    done

    CLS=$1
    shift

    export_withlog SPARKBENCH_PROPERTIES_FILES

    YARN_OPTS=""
    if [[ "$SPARK_MASTER" == yarn-* ]]; then
        export_withlog HADOOP_CONF_DIR

        YARN_OPTS="--num-executors ${YARN_NUM_EXECUTORS}"
        if [[ -n "${YARN_EXECUTOR_CORES:-}" ]]; then
            YARN_OPTS="${YARN_OPTS} --executor-cores ${YARN_EXECUTOR_CORES}"
        fi
        if [[ -n "${SPARK_YARN_EXECUTOR_MEMORY:-}" ]]; then
            YARN_OPTS="${YARN_OPTS} --executor-memory ${SPARK_YARN_EXECUTOR_MEMORY}"
        fi
        if [[ -n "${SPAKR_YARN_DRIVER_MEMORY:-}" ]]; then    # note: the "SPAKR" typo is present in the upstream script
            YARN_OPTS="${YARN_OPTS} --driver-memory ${SPARK_YARN_DRIVER_MEMORY}"
        fi
    fi
    if [[ "$CLS" == *.py ]]; then
        LIB_JARS="$LIB_JARS --jars ${SPARKBENCH_JAR}"
        SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --master ${SPARK_MASTER} ${YARN_OPTS} ${CLS} $@"
    else
        SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --class ${CLS} --master ${SPARK_MASTER} ${YARN_OPTS} ${SPARKBENCH_JAR} $@"
    fi
    echo -e "${BGreen}Submit Spark job: ${Green}${SUBMIT_CMD}${Color_Off}"
    MONITOR_PID=`start_monitor`
    execute_withlog ${SUBMIT_CMD}
    result=$?
    stop_monitor ${MONITOR_PID}
    if [ $result -ne 0 ]; then
        echo -e "${BRed}ERROR${Color_Off}: Spark job ${BYellow}${CLS}${Color_Off} failed to run successfully."
echo -e "${BBlue}Hint${Color_Off}: You can goto ${BYellow}${WORKLOAD_RESULT_FOLDER}/bench.log${Color_Off} to check for detailed log.\nOpening log tail for you:\n" tail ${WORKLOAD_RESULT_FOLDER}/bench.log exit $result fi } function run_storm_job(){ CMD="${STORM_HOME}/bin/storm jar ${STREAMBENCH_STORM_JAR} $@" echo -e "${BGreen}Submit Storm Job: ${Green}$CMD${Color_Off}" execute_withlog $CMD } function run_gearpump_app(){ CMD="${GEARPUMP_HOME}/bin/gear app -executors ${STREAMBENCH_GEARPUMP_EXECUTORS} -jar ${STREAMBENCH_GEARPUMP_JAR} $@" echo -e "${BGreen}Submit Gearpump Application: ${Green}$CMD${Color_Off}" execute_withlog $CMD } function run_flink_job(){ CMD="${FLINK_HOME}/bin/flink run -p ${STREAMBENCH_FLINK_PARALLELISM} -m ${HIBENCH_FLINK_MASTER} $@ ${STREAMBENCH_FLINK_JAR} ${SPARKBENCH_PROPERTIES_FILES}" echo -e "${BGreen}Submit Flink Job: ${Green}$CMD${Color_Off}" execute_withlog $CMD } function run_hadoop_job(){ ENABLE_MONITOR= if [ "$1" = "--without-monitor" ]; then ENABLE_MONITOR= fi local job_jar=$ shift local job_name=$ shift local tail_arguments=$@ local CMD="${HADOOP_EXECUTABLE} --config ${HADOOP_CONF_DIR} jar $job_jar $job_name $tail_arguments" echo -e "${BGreen}Submit MapReduce Job: ${Green}$CMD${Color_Off}" ]; then MONITOR_PID=`start_monitor` fi execute_withlog ${CMD} result=$? ]; then stop_monitor ${MONITOR_PID} fi ]; then echo -e "${BRed}ERROR${Color_Off}: Hadoop job ${BYellow}${job_jar} ${job_name}${Color_Off} failed to run successfully." echo -e "${BBlue}Hint${Color_Off}: You can goto ${BYellow}${WORKLOAD_RESULT_FOLDER}/bench.log${Color_Off} to check for detailed log.\nOpening log tail for you:\n" tail ${WORKLOAD_RESULT_FOLDER}/bench.log exit $result fi } function ensure_hivebench_release(){ if [ ! -e ${HIBENCH_HOME}"/hadoopbench/sql/target/"$HIVE_RELEASE".tar.gz" ]; then assert "Error: The hive bin file hasn't be downloaded by maven, please check!" exit fi cd ${HIBENCH_HOME}"/hadoopbench/sql/target" if [ ! -d $HIVE_HOME ]; then tar zxf $HIVE_RELEASE".tar.gz" fi export_withlog HADOOP_EXECUTABLE } function ensure_mahout_release (){ if [ ! -e ${HIBENCH_HOME}"/hadoopbench/mahout/target/"$MAHOUT_RELEASE".tar.gz" ]; then assert "Error: The mahout bin file hasn't be downloaded by maven, please check!" exit fi cd ${HIBENCH_HOME}"/hadoopbench/mahout/target" if [ ! -d $MAHOUT_HOME ]; then tar zxf $MAHOUT_RELEASE".tar.gz" fi export_withlog HADOOP_EXECUTABLE export_withlog HADOOP_HOME export_withlog HADOOP_CONF_DIR } function execute () { CMD="$@" echo -e "${BCyan}Executing: ${Cyan}${CMD}${Color_Off}" $CMD } function printFullLog(){ export HIBENCH_PRINTFULLLOG= } function execute_withlog () { CMD="$@" ] ; then # Terminal, beautify the output. ${workload_func_bin}/execute_with_log.py ${WORKLOAD_RESULT_FOLDER}/bench.log $CMD else # pipe, do nothing. $CMD fi } function export_withlog () { var_name=$ var_val=${!} assert $ "export without a variable name!" echo -e "${BCyan}Export env: ${Cyan}${var_name}${BCyan}=${Cyan}${var_val}${Color_Off}" export ${var_name} } function command_exist () { result=$() ] then return else return fi } function ensure_nutchindexing_release () { if [ ! -e ${HIBENCH_HOME}"/hadoopbench/nutchindexing/target/apache-nutch-1.2-bin.tar.gz" ]; then assert "Error: The nutch bin file hasn't be downloaded by maven, please check!" exit fi NUTCH_ROOT=${WORKLOAD_RESULT_FOLDER} cp -a $NUTCH_DIR/nutch $NUTCH_ROOT cd ${HIBENCH_HOME}"/hadoopbench/nutchindexing/target" if [ ! -d $NUTCH_HOME ]; then tar zxf apache-nutch-1.2-bin.tar.gz fi find $NUTCH_HOME/lib ! 
-name "lucene-*" -type f -exec rm -rf {} \; rm -rf $NUTCH_ROOT/nutch_release cp -a $NUTCH_HOME $NUTCH_ROOT/nutch_release NUTCH_HOME_WORKLOAD=$NUTCH_ROOT/nutch_release cp $NUTCH_ROOT/nutch/conf/nutch-site.xml $NUTCH_HOME_WORKLOAD/conf cp $NUTCH_ROOT/nutch/bin/nutch $NUTCH_HOME_WORKLOAD/bin # Patching jcl-over-slf4j version against cdh or hadoop2 mkdir $NUTCH_HOME_WORKLOAD/temp unzip -q $NUTCH_HOME_WORKLOAD/nutch-1.2.job -d $NUTCH_HOME_WORKLOAD/temp rm -f $NUTCH_HOME_WORKLOAD/temp/lib/jcl-over-slf4j-*.jar rm -f $NUTCH_HOME_WORKLOAD/temp/lib/slf4j-log4j*.jar cp ${NUTCH_DIR}/target/dependency/jcl-over-slf4j-*.jar $NUTCH_HOME_WORKLOAD/temp/lib rm -f $NUTCH_HOME_WORKLOAD/nutch-1.2.job cd $NUTCH_HOME_WORKLOAD/temp zip -qr $NUTCH_HOME_WORKLOAD/nutch-1.2.job * rm -rf $NUTCH_HOME_WORKLOAD/temp echo $NUTCH_HOME_WORKLOAD } function prepare_sql_aggregation () { assert $ "SQL file path not exist" HIVEBENCH_SQL_FILE=$ >/dev/null cat <<EOF > ${HIVEBENCH_SQL_FILE} USE DEFAULT; set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; set ${MAP_CONFIG_NAME}=$NUM_MAPS; set ${REDUCER_CONFIG_NAME}=$NUM_REDS; set hive.stats.autogather=false; DROP TABLE IF EXISTS uservisits; CREATE EXTERNAL TABLE uservisits (sourceIP STRING,destURL STRING,visitDate STRING,adRevenue DOUBLE,userAgent STRING,countryCode STRING,languageCode STRING,searchWord STRING,duration INT ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$INPUT_HDFS/uservisits'; DROP TABLE IF EXISTS uservisits_aggre; CREATE EXTERNAL TABLE uservisits_aggre ( sourceIP STRING, sumAdRevenue DOUBLE) STORED AS SEQUENCEFILE LOCATION '$OUTPUT_HDFS/uservisits_aggre'; INSERT OVERWRITE TABLE uservisits_aggre SELECT sourceIP, SUM(adRevenue) FROM uservisits GROUP BY sourceIP; EOF } function prepare_sql_join () { assert $ "SQL file path not exist" HIVEBENCH_SQL_FILE=$ >/dev/null cat <<EOF > ${HIVEBENCH_SQL_FILE} USE DEFAULT; set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; set ${MAP_CONFIG_NAME}=$NUM_MAPS; set ${REDUCER_CONFIG_NAME}=$NUM_REDS; set hive.stats.autogather=false; DROP TABLE IF EXISTS rankings; CREATE EXTERNAL TABLE rankings (pageURL STRING, pageRank INT, avgDuration INT) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$INPUT_HDFS/rankings'; DROP TABLE IF EXISTS uservisits_copy; CREATE EXTERNAL TABLE uservisits_copy (sourceIP STRING,destURL STRING,visitDate STRING,adRevenue DOUBLE,userAgent STRING,countryCode STRING,languageCode STRING,searchWord STRING,duration INT ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$INPUT_HDFS/uservisits'; DROP TABLE IF EXISTS rankings_uservisits_join; CREATE EXTERNAL TABLE rankings_uservisits_join ( sourceIP STRING, avgPageRank DOUBLE, totalRevenue DOUBLE) STORED AS SEQUENCEFILE LOCATION '$OUTPUT_HDFS/rankings_uservisits_join'; INSERT OVERWRITE TABLE rankings_uservisits_join SELECT sourceIP, avg(pageRank), AND datediff(UV.visitDate, )) NUV ON (R.pageURL = NUV.destURL) group by sourceIP order by totalRevenue DESC; EOF } function prepare_sql_scan () { assert $ "SQL file path not exist" HIVEBENCH_SQL_FILE=$ >/dev/null cat <<EOF > ${HIVEBENCH_SQL_FILE} USE DEFAULT; set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; set ${MAP_CONFIG_NAME}=$NUM_MAPS; set ${REDUCER_CONFIG_NAME}=$NUM_REDS; set hive.stats.autogather=false; DROP TABLE IF EXISTS uservisits; CREATE EXTERNAL TABLE uservisits (sourceIP STRING,destURL STRING,visitDate STRING,adRevenue 
CREATE EXTERNAL TABLE uservisits (sourceIP STRING,destURL STRING,visitDate STRING,adRevenue DOUBLE,userAgent STRING,countryCode STRING,languageCode STRING,searchWord STRING,duration INT ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$INPUT_HDFS/uservisits';
DROP TABLE IF EXISTS uservisits_copy;
CREATE EXTERNAL TABLE uservisits_copy (sourceIP STRING,destURL STRING,visitDate STRING,adRevenue DOUBLE,userAgent STRING,countryCode STRING,languageCode STRING,searchWord STRING,duration INT ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$OUTPUT_HDFS/uservisits_copy';
INSERT OVERWRITE TABLE uservisits_copy SELECT * FROM uservisits;
EOF
}
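To see how these functions glue a benchmark together, here is a minimal sketch of the kind of run.sh each workload ships. It is modeled on the Hadoop wordcount workload; treat the relative paths, the workload name, and the HADOOP_EXAMPLES_JAR variable as assumptions for illustration rather than a verbatim copy of any HiBench workload script:

#!/bin/bash
# Sketch of a workload runner built on workload_functions.sh (paths/names are placeholders).
current_dir=`dirname "$0"`
current_dir=`cd "$current_dir"; pwd`
root_dir=${current_dir}/../../../../../
workload_config=${root_dir}/conf/workloads/micro/wordcount.conf
. "${root_dir}/bin/functions/workload_functions.sh"

enter_bench HadoopWordcount ${workload_config} ${current_dir}   # load merged config via load_config.py
show_bannar start                                               # print "start <workload> bench"

rmr_hdfs $OUTPUT_HDFS || true                                   # clean previous output on HDFS
SIZE=`dir_size $INPUT_HDFS`                                     # input size in bytes, via dus_hdfs

START_TIME=`timestamp`
run_hadoop_job ${HADOOP_EXAMPLES_JAR} wordcount $INPUT_HDFS $OUTPUT_HDFS   # monitor.py starts/stops inside
END_TIME=`timestamp`

gen_report ${START_TIME} ${END_TIME} ${SIZE}                    # append a line to the HiBench report
show_bannar finish
leave_bench

The important point is the sandwich structure: enter_bench pulls the configuration into the environment, run_hadoop_job (or run_spark_job) wraps the real execution between start_monitor and stop_monitor, and gen_report turns the two timestamp calls plus the input size into the Duration and Throughput columns of the report.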