spark源码分析, 任务反序列化及执行

1. ==> 接收消息, org.apache.spark.executor.CoarseGrainedExecutorBackend#receive

    // Handles a LaunchTask message: decodes the task description carried by the
    // message and hands it to the executor.
    case LaunchTask(data) =>
      if (executor != null) {
        // Rebuild the TaskDescription from the serialized payload of the message.
        val decodedTask = TaskDescription.decode(data.value)
        logInfo("Got assigned task " + decodedTask.taskId)
        executor.launchTask(this, decodedTask)
      } else {
        // No executor is available to run the task, so exitExecutor is invoked with code 1.
        exitExecutor(1, "Received LaunchTask command but executor was null")
      }

2. ==> org.apache.spark.executor.Executor#launchTask

  // Registry of the tasks that are currently running, keyed by task id.
  private val runningTasks = new ConcurrentHashMap[Long, TaskRunner]

  /**
   * Wraps the task description in a TaskRunner, records the runner in `runningTasks`
   * under the task's id, and submits it to the thread pool for execution.
   *
   * @param context         backend handed to the TaskRunner
   * @param taskDescription description of the task to run
   */
  def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = {
    val runner = new TaskRunner(context, taskDescription)
    runningTasks.put(taskDescription.taskId, runner)
    threadPool.execute(runner)
  }

3. ==>org.apache.spark.executor.Executor.TaskRunner#run

// Body of the TaskRunner (abridged excerpt): fetches dependencies, deserializes the task,
// runs it, then serializes the result and reports it through execBackend.statusUpdate.
// NOTE(review): ser, env, taskId, taskName, threwException, maxResultSize, maxDirectResultSize
// and execBackend are not declared in the lines shown; presumably members of the enclosing
// class or elided locals; confirm against the full source.
override def run(): Unit = {

      threadId = Thread.currentThread.getId

      Thread.currentThread.setName(threadName)

      val threadMXBean = ManagementFactory.getThreadMXBean

      val taskMemoryManager = new TaskMemoryManager(env.memoryManager, taskId)

// Download dependencies (the files and JARs listed in the task description)

        updateDependencies(taskDescription.addedFiles, taskDescription.addedJars)
// Deserialize the payload to obtain the actual Task object

        task = ser.deserialize[Task[Any]](taskDescription.serializedTask, Thread.currentThread.getContextClassLoader)

        task.localProperties = taskDescription.properties

        task.setTaskMemoryManager(taskMemoryManager)

    // Run the task; the second block passed to tryWithSafeFinally performs the cleanup.
    val value = Utils.tryWithSafeFinally {

              val res = task.run(

                taskAttemptId = taskId,

                attemptNumber = taskDescription.attemptNumber,

                metricsSystem = env.metricsSystem)

              threwException = false

              res

            } {

              // Cleanup: release every block lock held for this task and free its allocated memory.
              val releasedLocks = env.blockManager.releaseAllLocksForTask(taskId)

              val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory()

            }

    // Process the execution result

    val resultSer = env.serializer.newInstance()

    val beforeSerialization = System.currentTimeMillis()

    val valueBytes = resultSer.serialize(value)

    val afterSerialization = System.currentTimeMillis()       

    // Note: accumulator updates must be collected after TaskMetrics is updated

    val accumUpdates = task.collectAccumulatorUpdates()

    // TODO: do not serialize value twice

    val directResult = new DirectTaskResult(valueBytes, accumUpdates)

    val serializedDirectResult = ser.serialize(directResult)

    val resultSize = serializedDirectResult.limit()

    // directSend = sending directly back to the driver

    val serializedResult: ByteBuffer = {

      if (maxResultSize > 0 && resultSize > maxResultSize) {

        // Over maxResultSize: the bytes are not stored; only an IndirectTaskResult
        // carrying the block id and the size is serialized.
        logWarning(s"Finished $taskName (TID $taskId). Result is larger than maxResultSize " +

          s"(${Utils.bytesToString(resultSize)} > ${Utils.bytesToString(maxResultSize)}), " +

          s"dropping it.")

        ser.serialize(new IndirectTaskResult[Any](TaskResultBlockId(taskId), resultSize))

      } else if (resultSize > maxDirectResultSize) {

        // Over maxDirectResultSize: store the bytes in the BlockManager and serialize an
        // IndirectTaskResult that points at the stored block.
        val blockId = TaskResultBlockId(taskId)

        env.blockManager.putBytes(

          blockId,

          new ChunkedByteBuffer(serializedDirectResult.duplicate()),

          StorageLevel.MEMORY_AND_DISK_SER)

        logInfo(

          s"Finished $taskName (TID $taskId). $resultSize bytes result sent via BlockManager)")

        ser.serialize(new IndirectTaskResult[Any](blockId, resultSize))

      } else {

        // Within both limits: the serialized DirectTaskResult itself is the payload.
        logInfo(s"Finished $taskName (TID $taskId). $resultSize bytes result sent to driver")

        serializedDirectResult

      }

    }

    setTaskFinishedAndClearInterruptStatus()

    // Report the FINISHED state together with the serialized result.
    execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult)

}

==> org.apache.spark.executor.Executor#updateDependencies

  /**
   * Download any missing dependencies if we receive a new set of files and JARs from the
   * SparkContext. Also adds any new JARs we fetched to the class loader.
   *
   * @param newFiles file name -> timestamp; a file is fetched when its timestamp is newer than
   *                 the one recorded in `currentFiles` (or when it has no record yet)
   * @param newJars  JAR name -> timestamp; handled like files, and each newly fetched JAR is
   *                 also added to `urlClassLoader` when its URL is not already present
   */
  private def updateDependencies(newFiles: Map[String, Long], newJars: Map[String, Long]): Unit = {
    // Lazy so the Hadoop configuration is only built when something actually has to be fetched.
    lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)

    // Single download step shared by the file loop and the JAR loop.
    def fetch(name: String, timestamp: Long): Unit = {
      logInfo(s"Fetching $name with timestamp $timestamp")
      // Fetch file with useCache mode, close cache for local mode.
      Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf,
        env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
    }

    synchronized {
      // Fetch missing dependencies
      for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) {
        fetch(name, timestamp)
        currentFiles(name) = timestamp
      }

      for ((name, timestamp) <- newJars) {
        val localName = new URI(name).getPath.split("/").last
        // The recorded timestamp is looked up under the full name first, then the local name.
        val currentTimeStamp = currentJars.get(name)
          .orElse(currentJars.get(localName))
          .getOrElse(-1L)
        if (currentTimeStamp < timestamp) {
          fetch(name, timestamp)
          currentJars(name) = timestamp
          // Add it to our class loader
          val url = new File(SparkFiles.getRootDirectory(), localName).toURI.toURL
          if (!urlClassLoader.getURLs().contains(url)) {
            logInfo(s"Adding $url to class loader")
            urlClassLoader.addURL(url)
          }
        }
      }
    }
  }

==> org.apache.spark.scheduler.Task#run

 /**
  * Template method that prepares the execution environment of a task attempt and then
  * delegates to `runTask` (abridged excerpt).
  *
  * Steps visible here: register the attempt with the block manager, build the
  * TaskContextImpl (wrapped in a BarrierTaskContext when `isBarrier` is set), publish it
  * through TaskContext.setTaskContext, remember the running thread, re-issue a kill whose
  * reason was set before the task started, and set the caller context.
  *
  * @param taskAttemptId id of this task attempt
  * @param attemptNumber attempt number passed on to the task context
  * @param metricsSystem metrics system passed on to the task context
  * @return the value produced by `runTask`
  */
 final def run(

      taskAttemptId: Long,

      attemptNumber: Int,

      metricsSystem: MetricsSystem): T = {

    SparkEnv.get.blockManager.registerTask(taskAttemptId)

    val taskContext = new TaskContextImpl(

      stageId,

      stageAttemptId, // stageAttemptId and stageAttemptNumber are semantically equal

      partitionId,

      taskAttemptId,

      attemptNumber,

      taskMemoryManager,

      localProperties,

      metricsSystem,

      metrics)

    context = if (isBarrier) {

      new BarrierTaskContext(taskContext)

    } else {

      taskContext

    }

    TaskContext.setTaskContext(context)

    taskThread = Thread.currentThread()

    // A kill reason that is already set is re-applied here, without interrupting the thread.
    if (_reasonIfKilled != null) {

      kill(interruptThread = false, _reasonIfKilled)

    }

    new CallerContext(

      "TASK",

      SparkEnv.get.conf.get(APP_CALLER_CONTEXT),

      appId,

      appAttemptId,

      jobId,

      Option(stageId),

      Option(stageAttemptId),

      Option(taskAttemptId),

      Option(attemptNumber)).setCurrentContext()

    try {

    // Task is only a template (abstract) class; the concrete implementations are ResultTask and ShuffleMapTask

      runTask(context)

    }

  }

==>org.apache.spark.scheduler.ShuffleMapTask#runTask

ShuffleMapTask将rdd的元素，切分为多个bucket, 基于ShuffleDependency指定的partitioner，默认就是HashPartitioner

ShuffleMapTask 核心方法是 RDD.iterator[底层调用 compute 方法(fn(context,index,partition))],

执行完成rdd之后，rdd会返回处理过后的partition数据，这些数据通过shuffleWriter再经过HashPartitioner写入对应的分区中

// ShuffleMapTask splits the elements of an RDD into multiple buckets,

// based on the partitioner specified by the ShuffleDependency (HashPartitioner by default, per the article).

private[spark] class ShuffleMapTask(

   ...

   // ShuffleMapTask's runTask returns a MapStatus

  override def runTask(context: TaskContext): MapStatus = {

    // Deserialize the RDD using the broadcast variable.

    val threadMXBean = ManagementFactory.getThreadMXBean

    val deserializeStartTime = System.currentTimeMillis()

    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {

      threadMXBean.getCurrentThreadCpuTime

    } else 0L

    // Deserialize the data this task is going to process

    val ser = SparkEnv.get.closureSerializer.newInstance()

    // Obtain the RDD (together with its ShuffleDependency)

    val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](

      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)

    // Record the wall-clock and CPU time spent on deserialization.
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {

      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime

    } else 0L

    var writer: ShuffleWriter[Any, Any] = null

    try {

      // Get the ShuffleManager

      val manager = SparkEnv.get.shuffleManager

      // Get the ShuffleWriter

      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)

      // Core logic: call the RDD's iterator method, passing in the partition to process.

      // Once the RDD has been computed it returns the processed partition data, which the ShuffleWriter

      // then writes into the corresponding partitions after going through the HashPartitioner.

      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])

      // Return the MapStatus result, which encapsulates where the ShuffleMapTask's output is stored, i.e. BlockManager-related information

      writer.stop(success = true).get

    }

  }

  ...

}

==> org.apache.spark.scheduler.ResultTask#runTask

  /**
   * Deserializes the (RDD, function) pair from the task binary, records how much wall-clock
   * and CPU time the deserialization took, and then applies the function to the iterator of
   * this task's partition.
   *
   * @param context context of the running task, passed to both the RDD iterator and the function
   * @return the value produced by the deserialized function
   */
  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val mxBean = ManagementFactory.getThreadMXBean
    val wallClockStart = System.currentTimeMillis()
    val cpuStart =
      if (mxBean.isCurrentThreadCpuTimeSupported) mxBean.getCurrentThreadCpuTime else 0L

    val closureSer = SparkEnv.get.closureSerializer.newInstance()
    val (targetRdd, userFunc) = closureSer.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)

    _executorDeserializeTime = System.currentTimeMillis() - wallClockStart
    _executorDeserializeCpuTime =
      if (mxBean.isCurrentThreadCpuTimeSupported) {
        mxBean.getCurrentThreadCpuTime - cpuStart
      } else {
        0L
      }

    // Invoke the deserialized function directly on this partition's iterator.
    userFunc(context, targetRdd.iterator(partition, context))
  }

==> org.apache.spark.rdd.RDD#iterator

 /**
  * Returns the iterator over the given partition. When the RDD has no storage level
  * (StorageLevel.NONE) the partition is computed or read from a checkpoint; otherwise the
  * call goes through getOrCompute.
  */
 final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
    if (storageLevel == StorageLevel.NONE) {
      // No storage level set: compute the partition or read it from a checkpoint.
      computeOrReadCheckpoint(split, context)
    } else {
      // A storage level is set: delegate to getOrCompute.
      getOrCompute(split, context)
    }
  }

==> org.apache.spark.rdd.RDD#computeOrReadCheckpoint

  /**
   * Produces the data of one partition: taken from the first parent's iterator when this RDD
   * is checkpointed and materialized, computed via `compute` otherwise.
   */
  private[spark] def computeOrReadCheckpoint(split: Partition, context: TaskContext): Iterator[T] = {
    if (!isCheckpointedAndMaterialized) {
      // Core step: compute is implemented by the concrete RDD subclasses,
      // e.g. MapPartitionsRDD and JdbcRDD.
      compute(split, context)
    } else {
      firstParent[T].iterator(split, context)
    }
  }

demo:

/**
 * RDD whose partitions are produced by applying `f` to the first parent's iterator
 * for the same split.
 */
class MapPartitionsRDD[U: ClassTag, T: ClassTag](
    var prev: RDD[T],
    f: (TaskContext, Int, Iterator[T]) => Iterator[U],  // (TaskContext, partition index, iterator)
    preservesPartitioning: Boolean = false,
    isFromBarrier: Boolean = false,
    isOrderSensitive: Boolean = false)
  extends RDD[U](prev) {

  /** Feeds the task context, the split's index and the parent iterator to `f`. */
  override def compute(split: Partition, context: TaskContext): Iterator[U] = {
    val partitionIndex = split.index
    val parentIterator = firstParent[T].iterator(split, context)
    f(context, partitionIndex, parentIterator)
  }
}

/**
 * RDD that runs a SQL query over JDBC. The inclusive range [lowerBound, upperBound] is split
 * into `numPartitions` sub-ranges, one JdbcPartition each.
 *
 * @param getConnection factory for the JDBC connection; invoked once per computed partition
 * @param sql           text of the query that is prepared and executed
 * @param lowerBound    inclusive lower end of the overall range
 * @param upperBound    inclusive upper end of the overall range
 * @param numPartitions number of partitions the range is divided into
 * @param mapRow        converts the current row of the ResultSet into a T
 */
class JdbcRDD[T: ClassTag](
    sc: SparkContext,
    getConnection: () => Connection,
    sql: String,
    lowerBound: Long,
    upperBound: Long,
    numPartitions: Int,
    mapRow: (ResultSet) => T = JdbcRDD.resultSetToObjectArray _)
  extends RDD[T](sc, Nil) with Logging {

  /** Splits the inclusive range into `numPartitions` contiguous sub-ranges. */
  override def getPartitions: Array[Partition] = {
    // bounds are inclusive, hence the + 1 here and - 1 on end
    val length = BigInt(1) + upperBound - lowerBound
    (0 until numPartitions).map { i =>
      val start = lowerBound + ((i * length) / numPartitions)
      val end = lowerBound + (((i + 1) * length) / numPartitions) - 1
      new JdbcPartition(i, start.toLong, end.toLong)
    }.toArray
  }

  /**
   * Opens a connection, executes the query and exposes the rows as an iterator of `mapRow`
   * results. The JDBC resources are released in `close()`.
   */
  override def compute(thePart: Partition, context: TaskContext): Iterator[T] = new NextIterator[T]
  {
    // Registered before any resource is opened, so close() may run while the fields
    // below are still unassigned (null) if the setup fails part-way.
    context.addTaskCompletionListener[Unit]{ context => closeIfNeeded() }

    // NOTE(review): `part` and `url` are not used in this excerpt, and no parameters are
    // bound on the statement; presumably the partition bounds are bound in the full
    // source. Confirm before relying on this code.
    val part = thePart.asInstanceOf[JdbcPartition]
    val conn = getConnection()
    val stmt = conn.prepareStatement(sql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)
    val url = conn.getMetaData.getURL
    val rs = stmt.executeQuery()

    override def getNext(): T = {
      if (rs.next()) {
        mapRow(rs)
      } else {
        // Signal exhaustion; the returned value is a placeholder only.
        finished = true
        null.asInstanceOf[T]
      }
    }

    /** Releases the result set, the statement and the connection, in that order. */
    override def close(): Unit = {
      closeQuietly("resultset", rs)
      closeQuietly("statement", stmt)
      closeQuietly("connection", conn)
    }

    // Closes one resource; a failure is logged so the remaining resources still get closed.
    private def closeQuietly(what: String, resource: AutoCloseable): Unit = {
      try {
        // The resource is null when setup failed before it was assigned.
        if (resource != null) {
          resource.close()
        }
      } catch {
        case e: Exception => logWarning(s"Exception closing $what", e)
      }
    }
  }
}

spark源码分析, 任务反序列化及执行的更多相关文章

spark 源码分析之二十一 -- Task的执行流程
引言在上两篇文章 spark 源码分析之十九 -- DAG的生成和Stage的划分和 spark 源码分析之二十 -- Stage的提交中剖析了Spark的DAG的生成,Stage的划分以及St ...
Spark 源码分析 -- task实际执行过程
Spark源码分析 – SparkContext 中的例子, 只分析到sc.runJob 那么最终是怎么执行的? 通过DAGScheduler切分成Stage, 封装成taskset, 提交给Task ...
Spark源码分析之九：内存管理模型
Spark是现在很流行的一个基于内存的分布式计算框架,既然是基于内存,那么自然而然的,内存的管理就是Spark存储管理的重中之重了.那么,Spark究竟采用什么样的内存管理模型呢?本文就为大家揭开Sp ...
Spark源码分析之八：Task运行（二）
在<Spark源码分析之七:Task运行(一)>一文中,我们详细叙述了Task运行的整体流程,最终Task被传输到Executor上,启动一个对应的TaskRunner线程,并且在线程池中 ...
Spark源码分析之七：Task运行（一）
在Task调度相关的两篇文章<Spark源码分析之五:Task调度(一)>与<Spark源码分析之六:Task调度(二)>中,我们大致了解了Task调度相关的主要逻辑,并且在T ...
Spark源码分析之四：Stage提交
各位看官,上一篇<Spark源码分析之Stage划分>详细讲述了Spark中Stage的划分,下面,我们进入第三个阶段--Stage提交. Stage提交阶段的主要目的就一个,就是将每个S ...
Spark源码分析之三：Stage划分
继上篇<Spark源码分析之Job的调度模型与运行反馈>之后,我们继续来看第二阶段--Stage划分. Stage划分的大体流程如下图所示: 前面提到,对于JobSubmitted事件,我 ...
spark 源码分析之十六 -- Spark内存存储剖析
上篇spark 源码分析之十五 -- Spark内存管理剖析讲解了Spark的内存管理机制,主要是MemoryManager的内容.跟Spark的内存管理机制最密切相关的就是内存存储,本篇文章主要介 ...
spark 源码分析之十八 -- Spark存储体系剖析
本篇文章主要剖析BlockManager相关的类以及总结Spark底层存储体系. 总述先看 BlockManager相关类之间的关系如下: 我们从NettyRpcEnv 开始,做一下简单说明. Ne ...

随机推荐

Locust性能测试2--登录示例
无论是做接口自动化还是做压测,解决了登录就离成功进步了一大半,下面做个简单的登录案例,后续再说下数据依赖及参数化等问题 1. 登录登录示例 from locust import HttpUser, ...
Bellman-Ford算法例题：P3371 单源最短路径
看到还没人用Bellman-Ford过,赶紧水一发 lz非常弱,求各位大佬轻喷qwq 洛谷题目传送门:P3371 0."松弛"操作如果存在一条边$(u,v)$通过中继的方式可 ...
LCA详解
LCA,即最近公共祖先,在图论中应用比较广泛. LCA的定义如下:给定一个有根树,若节点$z$同时是节点$x$和节点$y$的祖先,则称$z$是$x,y$的公共祖先:在$x,y$的所有公共祖先当中深度最 ...
js实现csv下载
var TableDatas = '数据源';function getCSV(){ if(this.TableDatas.length === 0){ alert("没有数据呀呀呀呀!&q ...
Selenium使用cookis登录，并临时将cookis存储在本地【shelve数据库】
Python中自带了一个shelve库,可以帮助我们存储一些少量的数据. shelve数据库类似redis,是以[键值对]的方式进行数据的存储,有点像"字典"这种数据结构,存储在本 ...
unity 4种实现动态障碍方法
此文将介绍4种实现动态障碍的方法,2种基于navmesh,2种基于astar算法. 1.基于navmesh. 1.制作场景障碍: a.有几个独立的障碍物,就定义几个user area,即,一个场景仅仅 ...
Spine学习九 - 冰冻效果
想象这样一个效果,一个人被冰霜攻击命中,然后这个人整个就被冰冻了,那么spine动画要如何实现这个效果呢? 1.首先需要一个Spine动画,这个动画应该是相对静止的,因为人物已经被冰冻了,那么这个人儿 ...
java中equals与hashCode的重写问题
这几天有一个朋友问我在重写equals和hashCode上出现了问题,最后我帮她解决了问题,同时也整理出来分享给大家现上Object的equals与HashCode的代码 public boolea ...
[BUUOJ记录] [GYCTF]EasyThinking
主要考察ThinkPHP6.0的一个任意文件写入的CVE以及突破disable_function的方法. ThinkPHP6.0.0任意文件操作漏洞理论分析进入题目是一个简单的操作页面,dirma ...
一文搞懂WordPress建站
文章首发于:https://zouwang.vip/ 日日夜夜的等待,WordPress建站教程终于来了.本篇文章适用于第一次建站的小白,帮助你从零搭建起一个属于自己的网站,既然是从零,那么我就会带着 ...

spark源码分析, 任务反序列化及执行

spark源码分析, 任务反序列化及执行的更多相关文章

随机推荐

热门专题