run.sh

#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

current_dir=`dirname "$0"`
current_dir=`cd "$current_dir"; pwd`
root_dir=${current_dir}/../../../../../
workload_config=${root_dir}/conf/workloads/sql/scan.conf
. "${root_dir}/bin/functions/load_bench_config.sh"

enter_bench ScalaSparkScan ${workload_config} ${current_dir}
show_bannar start

# prepare SQL
HIVEBENCH_SQL_FILE=${WORKLOAD_RESULT_FOLDER}/rankings_uservisits_scan.hive
prepare_sql_scan ${HIVEBENCH_SQL_FILE}

START_TIME=`timestamp`
rmr_hdfs $OUTPUT_HDFS
run_spark_job com.intel.hibench.sparkbench.sql.ScalaSparkSQLBench ScalaScan ${HIVEBENCH_SQL_FILE}
END_TIME=`timestamp`

SIZE=`dir_size $OUTPUT_HDFS`
gen_report ${START_TIME} ${END_TIME} ${SIZE:-}
show_bannar finish
leave_bench
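
The HiveQL that the benchmark executes is generated at run time by prepare_sql_scan, another helper in workload_functions.sh. A minimal sketch of that helper is shown below, assuming the uservisits schema shared by the HiBench SQL workloads; the column list, hive settings, and table locations are approximations rather than the literal generated file:

function prepare_sql_scan() {
    # Hypothetical sketch: write the scan query into the per-run SQL file.
    HIVEBENCH_SQL_FILE=$1
    cat <<EOF > ${HIVEBENCH_SQL_FILE}
USE DEFAULT;
DROP TABLE IF EXISTS uservisits;
CREATE EXTERNAL TABLE uservisits (sourceIP STRING, destURL STRING, visitDate STRING, adRevenue DOUBLE, userAgent STRING, countryCode STRING, languageCode STRING, searchWord STRING, duration INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS SEQUENCEFILE LOCATION '${INPUT_HDFS}/uservisits';
DROP TABLE IF EXISTS uservisits_copy;
CREATE EXTERNAL TABLE uservisits_copy (sourceIP STRING, destURL STRING, visitDate STRING, adRevenue DOUBLE, userAgent STRING, countryCode STRING, languageCode STRING, searchWord STRING, duration INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS SEQUENCEFILE LOCATION '${OUTPUT_HDFS}/uservisits_copy';
INSERT OVERWRITE TABLE uservisits_copy SELECT * FROM uservisits;
EOF
}

In short, the scan workload reads the whole uservisits table and writes it into a second external table under ${OUTPUT_HDFS}, which is why run.sh clears ${OUTPUT_HDFS} with rmr_hdfs before submitting and reports dir_size on it afterwards.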

workload_functions.sh

function run_spark_job() {
    LIB_JARS=
    while (($#)); do
      if [ "$1" = "--jars" ]; then
        LIB_JARS="--jars $2"

        continue
      fi
      break
    done

    CLS=$1
    shift

    export_withlog SPARKBENCH_PROPERTIES_FILES

    YARN_OPTS=""
    if [[ "$SPARK_MASTER" == yarn-* ]]; then
        export_withlog HADOOP_CONF_DIR

        YARN_OPTS="--num-executors ${YARN_NUM_EXECUTORS}"
        if [[ -n "${YARN_EXECUTOR_CORES:-}" ]]; then
            YARN_OPTS="${YARN_OPTS} --executor-cores ${YARN_EXECUTOR_CORES}"
       fi
       if [[ -n "${SPARK_YARN_EXECUTOR_MEMORY:-}" ]]; then
           YARN_OPTS="${YARN_OPTS} --executor-memory ${SPARK_YARN_EXECUTOR_MEMORY}"
       fi
       if [[ -n "${SPAKR_YARN_DRIVER_MEMORY:-}" ]]; then
           YARN_OPTS="${YARN_OPTS} --driver-memory ${SPARK_YARN_DRIVER_MEMORY}"
       fi
    fi
    if [[ "$CLS" == *.py ]]; then
        LIB_JARS="$LIB_JARS --jars ${SPARKBENCH_JAR}"
        SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --master ${SPARK_MASTER} ${YARN_OPTS} ${CLS} $@"
    else
        SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --class ${CLS} --master ${SPARK_MASTER} ${YARN_OPTS} ${SPARKBENCH_JAR} $@"
    fi
    echo -e "${BGreen}Submit Spark job: ${Green}${SUBMIT_CMD}${Color_Off}"
    MONITOR_PID=`start_monitor`
    execute_withlog ${SUBMIT_CMD}
    result=$?
    stop_monitor ${MONITOR_PID}
    if [ $result -ne 0 ]
    then
        echo -e "${BRed}ERROR${Color_Off}: Spark job ${BYellow}${CLS}${Color_Off} failed to run successfully."
        echo -e "${BBlue}Hint${Color_Off}: You can goto ${BYellow}${WORKLOAD_RESULT_FOLDER}/bench.log${Color_Off} to check for detailed log.\nOpening log tail for you:\n"
        tail ${WORKLOAD_RESULT_FOLDER}/bench.log
        exit $result
    fi
}
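
Putting run.sh and run_spark_job together: for this workload CLS is com.intel.hibench.sparkbench.sql.ScalaSparkSQLBench and the trailing arguments are the workload name and the generated SQL file, so on a YARN deployment the assembled SUBMIT_CMD looks roughly like the sketch below (the master URL and executor sizing are hypothetical values standing in for the HiBench configuration; the variables are the ones exported by the scripts above):

${SPARK_HOME}/bin/spark-submit \
    --properties-file ${SPARK_PROP_CONF} \
    --class com.intel.hibench.sparkbench.sql.ScalaSparkSQLBench \
    --master yarn-client \
    --num-executors 4 --executor-cores 4 \
    --executor-memory 4g --driver-memory 4g \
    ${SPARKBENCH_JAR} \
    ScalaScan ${WORKLOAD_RESULT_FOLDER}/rankings_uservisits_scan.hive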

ScalaSparkSQLBench.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.intel.hibench.sparkbench.sql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

/*
 * ported from HiBench's hive bench
 */
object ScalaSparkSQLBench{
  def main(args: Array[String]){
    if (args.length < 2){
      System.err.println(
        s"Usage: $ScalaSparkSQLBench <workload name> <SQL sciprt file>"
      )
      System.exit(1)
    }
    val workload_name = args(0)
    val sql_file = args(1)
    val sparkConf = new SparkConf().setAppName(workload_name)
    val sc = new SparkContext(sparkConf)
    val hc = new HiveContext(sc)

    val _sql = scala.io.Source.fromFile(sql_file).mkString
    _sql.split(';').foreach { x =>
      if (x.trim.nonEmpty)
        hc.sql(x)
    }

    sc.stop()
  }
}

HiveData.java

package HiBench;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;
import org.apache.hadoop.mapred.lib.NLineInputFormat;

public class HiveData {

    private static final Log log = LogFactory.getLog(HiveData.class.getName());

    private static final String RANKINGS = "rankings";
    private static final String USERVISITS = "uservisits";
    public static final String uagentf = "user_agents";
    public static final String countryf = "country_codes";
    public static final String searchkeyf = "search_keys";

    private DataOptions options;
    private long visits;

    // client side delim
    private String cdelim = ",";
    private int chashsize = 150 * 1024 * 1024;

    private Dummy dummy;

    HiveData(DataOptions options) {
        this.options = options;
        parseArgs(options.getRemainArgs());
    }

    private void parseArgs(String[] args) {

        for (int i=0; i<args.length; i++) {
            if ("-v".equals(args[i])) {
                visits = Long.parseLong(args[++i]);
            } else if ("-d".equals(args[i])) {
                cdelim = args[++i];
            } else {
                DataOptions.printUsage("Unknown hive data arguments -- " + args[i] + "!!!");
            }
        }

        if (chashsize > options.getNumPages()) {
            chashsize = (int) options.getNumPages();
        }

    }

    private void setRankingsOptions(JobConf job) throws URISyntaxException {
        job.setLong("pages", options.getNumPages());
        job.setLong("slotpages", options.getNumSlotPages());
        job.set("delimiter", cdelim);
        job.setInt("hashsize", chashsize);
        Utils.shareLinkZipfCore(options, job);
    }

    private void setVisitsOptions(JobConf job) {
        job.setInt("slots", options.getNumMaps());
        job.setLong("pages", options.getNumPages());
        job.setLong("visits", visits);
        job.set("delimiter", cdelim);
    }

    public static class DummyToRankingsMapper extends MapReduceBase implements
    Mapper<LongWritable, Text, LongWritable, JoinBytesInt> {

        private static final Log log = LogFactory.getLog(DummyToRankingsMapper.class.getName());

        private HtmlCore generator;
        private long pages, slotpages;
        private boolean outset;
        private OutputCollector<LongWritable, JoinBytesInt> myout;
        private JoinBytesInt uitem, ritem;
        private short[] hash;
        private HashMap<Integer, Integer> hm;
        private int hashsize;

        private void getOptions(JobConf job) {
            pages = job.getLong("pages", 0);
            slotpages = job.getLong("slotpages", 0);
            hashsize = job.getInt("hashsize", 0);
        }

        public void configure(JobConf job) {

            getOptions(job);

            try {
                generator = new HtmlCore(job);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }

            outset = false;
            myout = null;
            uitem = new JoinBytesInt();
            uitem.url = new byte[HtmlCore.getMaxUrlLength()];
            ritem = new JoinBytesInt();
            ritem.refs = 1;

            hash = new short[hashsize];
            hm = new HashMap<Integer, Integer>();
        }

        public void map(LongWritable key, Text value, OutputCollector<LongWritable, JoinBytesInt> output,
                Reporter reporter) throws IOException {

            if (!outset) {
                myout = output;
                outset = true;
            }

            int slotId = Integer.parseInt(value.toString().trim());
            generator.fireRandom(slotId);

            long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);

            /**
             * For output collect
             */
            for (long i=range[0]; i<range[1]; i++) {
                key.set(i);

                generator.nextUrlJoinBytesInt(uitem);
                output.collect(key, uitem);

                long[] linkids = generator.genPureLinkIds();
                for (int j=0; j<linkids.length; j++) {
                    long uid = linkids[j];
                    if (uid < hashsize) {
                        int iid = (int) uid;
                        if (hash[iid]>=0) {
                            if (hash[iid]==HtmlCore.MAX_SHORT) {
                                hm.put(iid, (int) (hash[iid]) + 1);
                                hash[iid] = -1;
                            } else {
                                hash[iid]++;
                            }
                        } else {
                            hm.put(iid, hm.get(iid) + 1);
                        }
                    } else {
                        key.set(uid);
                        output.collect(key, ritem);
                    }
                }

                if (0==(i % 10000)) {
                    log.info("still running: " + (i - range[0]) + " of " + slotpages);
                }
            }
        }

        @Override
        public void close ()
        {
            try {
                LongWritable k = new LongWritable();
                for (int i=0; i<hash.length; i++) {
                    if (hash[i] > 0) {
                        k.set(i);
                        ritem.refs = hash[i];
                        myout.collect(k, ritem);
                    } else if (hash[i] < 0) {
                        k.set(i);
                        ritem.refs = hm.get(i);
                        myout.collect(k, ritem);
                    }
                }
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }

    public static class JoinBytesIntCombiner extends MapReduceBase implements
    Reducer<LongWritable, JoinBytesInt, LongWritable, JoinBytesInt> {

//        Log log = null;
        JoinBytesInt item;

        @Override
        public void configure (JobConf job)
        {
            item = new JoinBytesInt();
//            log = LogFactory.getLog(JoinBytesIntCombiner.class.getName());
        }

        @Override
        public void reduce(LongWritable key, Iterator<JoinBytesInt> values,
                OutputCollector<LongWritable, JoinBytesInt> output, Reporter reporter) throws IOException {

            item.clear();
//            StringBuffer sb =  new StringBuffer("Combine: " + v.toString());
            while (values.hasNext()) {
                item.add(values.next());
//                sb.append("-> " + v.toString());
            }
            output.collect(key, item);
//            log.info(sb);
        }
    }

    public static class GenerateRankingsReducer extends MapReduceBase implements
    Reducer<LongWritable, JoinBytesInt, LongWritable, Text> {

        private static final Log log = LogFactory.getLog(GenerateRankingsReducer.class.getName());

        private Random rand;
        private int errors, missed;
        private JoinBytesInt v;
        private int pid;

        // job side delimiter
        private String delim;
//        private String missedids;

        public void configure (JobConf job)
        {
            delim = job.get("delimiter");
            pid = job.getInt("mapred.task.partition", 0);
            rand = new Random(pid + 1);

            v = new JoinBytesInt();

            errors = 0;
            missed = 0;
//            missedids = "";
        }

        public void close ()
        {
            log.info("pid: " + pid + ", " + errors + " erros, " + missed + " missed");
        }

        @Override
        public void reduce(LongWritable key, Iterator<JoinBytesInt> values,
                OutputCollector<LongWritable, Text> output, Reporter reporter) throws IOException {

            v.clear();
            while (values.hasNext()) {
                v.add(values.next());
            }

            if (0!=v.ulen) {
                if (v.refs > 0) {
                    Text value = new Text(
                            new String(v.url) +
                            delim +
                            v.refs +
                            delim +
                            (rand.nextInt(99) + 1)
                            );
                    output.collect(
                            key, value);

                    reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8+value.getLength());
                } else {
                    missed++;
                }
            } else {
                errors++;
            }
        }
    }

    private void createRankingsTableDirectly() throws IOException, URISyntaxException {

        log.info("Creating table rankings...");

        Path fout = new Path(options.getResultPath(), RANKINGS);

        JobConf job = new JobConf(HiveData.class);
        String jobname = "Create rankings";

        /** TODO: change another more effective way as this operation may cause
         *  about 2 min delay (originally ~15min in total)
         */
        setRankingsOptions(job);
        job.setJobName(jobname);
        job.set("mapred.reduce.slowstart.completed.maps", "0.3");
        job.set("mapreduce.job.reduce.slowstart.completedmaps", "0.3");

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(JoinBytesInt.class);

        job.setJarByClass(DummyToRankingsMapper.class);
        job.setJarByClass(JoinBytesIntCombiner.class);
        job.setJarByClass(GenerateRankingsReducer.class);

        job.setMapperClass(DummyToRankingsMapper.class);
        job.setCombinerClass(JoinBytesIntCombiner.class);
        job.setReducerClass(GenerateRankingsReducer.class);

        if (options.getNumReds() > 0) {
            job.setNumReduceTasks(options.getNumReds());
        } else {
            job.setNumReduceTasks(Utils.getMaxNumReds());
        }

        job.setInputFormat(NLineInputFormat.class);
        FileInputFormat.setInputPaths(job, dummy.getPath());

         job.set("mapred.map.output.compression.type", "BLOCK");
          job.set("mapreduce.output.fileoutputformat.compress.type","BLOCK");
         MapFileOutputFormat.setCompressOutput(job, true);
//        MapFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.LzoCodec.class);
         MapFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.DefaultCodec.class);

        if (options.isSequenceOut()) {
            job.setOutputFormat(SequenceFileOutputFormat.class);
        } else {
            job.setOutputFormat(TextOutputFormat.class);
        }

        if (null != options.getCodecClass()) {
            job.set("mapred.output.compression.type","BLOCK");
             job.set("mapreduce.output.fileoutputformat.compress.type","BLOCK");
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
        }

        FileOutputFormat.setOutputPath(job, fout);

        log.info("Running Job: " +jobname);
        log.info("Pages file " + dummy.getPath() + " as input");
        log.info("Rankings file " + fout + " as output");
        JobClient.runJob(job);
        log.info("Finished Running Job: " + jobname);
    }

    /***
     * Mapper to randomly create user visits. In map step, only the target
     * urls of user visits are created, the rest content of visits will be
     * created in reduce step
     * @author lyi2
     *
     */
    public static class DummyToAccessNoMapper extends MapReduceBase implements
    Mapper<LongWritable, Text, LongWritable, JoinBytesInt> {

        private JoinBytesInt vitem;
        private long pages;
        private long slots;
        private long visits;

        // job side delimiter
        private String delim;
        private Visit visit;

        public void configure (JobConf job)
        {
            try {
                pages = job.getLong("pages", 0);
                slots = job.getLong("slots", 0);
                visits = job.getLong("visits", 0);
                delim = job.get("delimiter");

                visit = new Visit(DistributedCache.getLocalCacheFiles(job),
                        delim, pages);

                vitem = new JoinBytesInt();
                vitem.refs = 1;

            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<LongWritable, JoinBytesInt> output, Reporter reporter)
                        throws IOException {

            int slotId = Integer.parseInt(value.toString().trim());
            visit.fireRandom(slotId);

            for (long i=slotId; i<=visits;) {
                // simply setting url id is fine in map step
                key.set(visit.nextUrlId());
                output.collect(key, vitem);
                i = i + slots;
            }
        }
    }

    public static class SequenceRankingsToUrlsMapper extends MapReduceBase implements
    Mapper<LongWritable, Text, LongWritable, JoinBytesInt> {
        public JoinBytesInt uitem;

        public void configure(JobConf job) {
            uitem = new JoinBytesInt();
//            getBasicOptions(job);
        }

        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<LongWritable, JoinBytesInt> output, Reporter reporter) throws IOException {

            uitem.url= value.toString().split(",")[0].getBytes();
            uitem.ulen = (byte) uitem.url.length;

            output.collect(key, uitem);
        }
    }

    public static class TextRankingsToUrlsMapper extends MapReduceBase implements
    Mapper<LongWritable, Text, LongWritable, JoinBytesInt> {
        public JoinBytesInt uitem;

        public void configure(JobConf job) {
            uitem = new JoinBytesInt();
//            getBasicOptions(job);
        }

        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<LongWritable, JoinBytesInt> output, Reporter reporter) throws IOException {

            String[] items = value.toString().split("[,\t]");
            key.set(Long.parseLong(items[0]));
            uitem.url= items[1].getBytes();
            uitem.ulen = (byte) uitem.url.length;

            output.collect(key, uitem);
        }
    }

    public static class CreateUserVisitsReducer extends MapReduceBase implements
    Reducer<LongWritable, JoinBytesInt, LongWritable, Text> {

        private static final Log log = LogFactory.getLog(CreateUserVisitsReducer.class.getName());

        private long pages;
        private Visit visit;

        private int errors, missed;
        private JoinBytesInt vitem;

        // job side delimiter
        private String delim;
        private int pid;

        public void configure (JobConf job)
        {
            try {
                pages = job.getLong("pages", 0);
                delim = job.get("delimiter");
                pid = job.getInt("mapred.task.partition", 0);

                visit = new Visit(DistributedCache.getLocalCacheFiles(job),
                        delim, pages);
                visit.fireRandom(pid + 1);

                vitem = new JoinBytesInt();

                errors = 0;
                missed = 0;

            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        public void close ()
        {
            log.info("pid: " + pid + ", " + errors + " erros, " + missed + " missed");
        }

        /**
         * Reduce: to sum up the record sizes (of slots) one by one so that to determine the
         * corresponding start point to hold the records for each slot.
         */
        @Override
        public void reduce(LongWritable key, Iterator<JoinBytesInt> values,
                OutputCollector<LongWritable, Text> output, Reporter reporter) throws IOException {

            vitem.clear();
//            StringBuffer sb = new StringBuffer("Reduce: " + v.toString());
            while (values.hasNext()) {
                vitem.add(values.next());
//                sb.append("-> " + v.toString());
            }
//            log.info(sb);

            if (0!=vitem.ulen) {
                if (vitem.refs > 0) {
                    for (int i=0; i<vitem.refs; i++) {
                        Text value = new Text(visit.nextAccess(new String(vitem.url)));
                        output.collect(key, value);
                        reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8+value.getLength());
                    }
                } else {
                    missed++;
                }
            } else {
                errors++;
            }
        }
    }

    private void createUserVisitsTableDirectly() throws IOException, URISyntaxException {

        log.info("Creating user visits...");

        Path rankings = new Path(options.getResultPath(), RANKINGS);
        Path fout = new Path(options.getResultPath(), USERVISITS);

        JobConf job = new JobConf(HiveData.class);
        String jobname = "Create uservisits";
        job.setJobName(jobname);
        setVisitsOptions(job);

        /***
         * Set distributed cache file for table generation,
         * cache files include:
         * 1. user agents
         * 2. country code and language code
         * 3. search keys
         */

        Path uagentPath = new Path(options.getWorkPath(), uagentf);
        DistributedCache.addCacheFile(uagentPath.toUri(), job);

        Path countryPath = new Path(options.getWorkPath(), countryf);
        DistributedCache.addCacheFile(countryPath.toUri(), job);

        Path searchkeyPath = new Path(options.getWorkPath(), searchkeyf);
        DistributedCache.addCacheFile(searchkeyPath.toUri(), job);

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(JoinBytesInt.class);

        MultipleInputs.addInputPath(job, dummy.getPath(),
                NLineInputFormat.class, DummyToAccessNoMapper.class);

        if (options.isSequenceOut()) {
            MultipleInputs.addInputPath(job, rankings,
                    SequenceFileInputFormat.class, SequenceRankingsToUrlsMapper.class);
        } else {
            MultipleInputs.addInputPath(job, rankings,
                    TextInputFormat.class, TextRankingsToUrlsMapper.class);
        }

        job.setCombinerClass(JoinBytesIntCombiner.class);
        job.setReducerClass(CreateUserVisitsReducer.class);

        if (options.getNumReds() > 0) {
            job.setNumReduceTasks(options.getNumReds());
        } else {
            job.setNumReduceTasks(Utils.getMaxNumReds());
        }

//        job.setNumReduceTasks(options.slots/2);

        if (options.isSequenceOut()) {
            job.setOutputFormat(SequenceFileOutputFormat.class);
        } else {
            job.setOutputFormat(TextOutputFormat.class);
        }

        if (null != options.getCodecClass()) {
            job.set("mapred.output.compression.type","BLOCK");
                        job.set("mapreduce.output.fileoutputformat.compress.type","BLOCK");
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
        }

        FileOutputFormat.setOutputPath(job, fout);

        log.info("Running Job: " +jobname);
        log.info("Dummy file " + dummy.getPath() + " as input");
        log.info("Rankings file " + rankings + " as input");
        log.info("Ouput file " + fout);
        JobClient.runJob(job);
        log.info("Finished Running Job: " + jobname);
    }

    public void generate() throws Exception {

        log.info("Generating hive data files...");
        init();

        createRankingsTableDirectly();
        createUserVisitsTableDirectly();

        close();
    }

    public void loadFiles() throws IOException {
        RawData.createSearchKeys(new Path(options.getWorkPath(), searchkeyf));
        RawData.createUserAgents(new Path(options.getWorkPath(), uagentf));
        RawData.createCCodes(new Path(options.getWorkPath(), countryf));
    }

    private void init() throws IOException {

        log.info("Initializing hive date generator...");

        Utils.checkHdfsPath(options.getResultPath(), true);
        Utils.checkHdfsPath(options.getWorkPath(), true);

        loadFiles();

        Utils.serialLinkZipf(options);

        dummy = new Dummy(options.getWorkPath(), options.getNumMaps());
    }

    public void close() throws IOException {

        log.info("Closing hive data generator...");
        Utils.checkHdfsPath(options.getWorkPath());
    }
}
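
For completeness, the rankings and uservisits tables that the scan query reads are produced by this generator during the prepare step, before run.sh is executed. A hedged sketch of that invocation is below: -v (and -d) are the flags handled by HiveData.parseArgs above, while the remaining flags are parsed by DataOptions, which is not shown here, so treat the exact flag set, the HiBench.DataGen class name, and ${DATATOOLS} (the autogen jar) as assumptions that may differ between HiBench versions:

# Hypothetical prepare step: generate the hive tables under ${INPUT_HDFS}.
run_hadoop_job ${DATATOOLS} HiBench.DataGen \
    -t hive -b ${INPUT_HDFS} -n Input \
    -m ${NUM_MAPS} -r ${NUM_REDS} \
    -p ${PAGES} -v ${USERVISITS} -o sequence

Here -p sets the number of pages (rows of rankings), -v the total number of user visits, and -o sequence selects SequenceFile output, which matches the isSequenceOut() branches in createRankingsTableDirectly and createUserVisitsTableDirectly.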
