6.7 Block Manager (BlockManager)
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.spark.storage
- import java.io._
- import java.lang.ref.{ReferenceQueue => JReferenceQueue, WeakReference}
- import java.nio.ByteBuffer
- import java.nio.channels.Channels
- import java.util.Collections
- import java.util.concurrent.ConcurrentHashMap
- import scala.collection.mutable
- import scala.collection.mutable.HashMap
- import scala.concurrent.{ExecutionContext, Future}
- import scala.concurrent.duration._
- import scala.reflect.ClassTag
- import scala.util.Random
- import scala.util.control.NonFatal
- import com.codahale.metrics.{MetricRegistry, MetricSet}
- import org.apache.spark._
- import org.apache.spark.executor.{DataReadMethod, ShuffleWriteMetrics}
- import org.apache.spark.internal.{config, Logging}
- import org.apache.spark.memory.{MemoryManager, MemoryMode}
- import org.apache.spark.metrics.source.Source
- import org.apache.spark.network._
- import org.apache.spark.network.buffer.ManagedBuffer
- import org.apache.spark.network.netty.SparkTransportConf
- import org.apache.spark.network.shuffle.{ExternalShuffleClient, TempFileManager}
- import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo
- import org.apache.spark.rpc.RpcEnv
- import org.apache.spark.serializer.{SerializerInstance, SerializerManager}
- import org.apache.spark.shuffle.ShuffleManager
- import org.apache.spark.storage.memory._
- import org.apache.spark.unsafe.Platform
- import org.apache.spark.util._
- import org.apache.spark.util.io.ChunkedByteBuffer
- /* Class for returning a fetched block and associated metrics. */
- private[spark] class BlockResult(
- val data: Iterator[Any],
- val readMethod: DataReadMethod.Value,
- val bytes: Long)
- /**
- * Abstracts away how blocks are stored and provides different ways to read the underlying block
- * data. Callers should call [[dispose()]] when they're done with the block.
- */
- private[spark] trait BlockData {
- def toInputStream(): InputStream
- /**
- * Returns a Netty-friendly wrapper for the block's data.
- *
- * Please see `ManagedBuffer.convertToNetty()` for more details.
- */
- def toNetty(): Object
- def toChunkedByteBuffer(allocator: Int => ByteBuffer): ChunkedByteBuffer
- def toByteBuffer(): ByteBuffer
- def size: Long
- def dispose(): Unit
- }
- private[spark] class ByteBufferBlockData(
- val buffer: ChunkedByteBuffer,
- val shouldDispose: Boolean) extends BlockData {
- override def toInputStream(): InputStream = buffer.toInputStream(dispose = false)
- override def toNetty(): Object = buffer.toNetty
- override def toChunkedByteBuffer(allocator: Int => ByteBuffer): ChunkedByteBuffer = {
- buffer.copy(allocator)
- }
- override def toByteBuffer(): ByteBuffer = buffer.toByteBuffer
- override def size: Long = buffer.size
- override def dispose(): Unit = {
- if (shouldDispose) {
- buffer.dispose()
- }
- }
- }
- /**
- * Manager running on every node (driver and executors) which provides interfaces for putting and
- * retrieving blocks both locally and remotely into various stores (memory, disk, and off-heap).
- *
- * Note that [[initialize()]] must be called before the BlockManager is usable.
- */
- private[spark] class BlockManager(
- executorId: String,
- rpcEnv: RpcEnv,
- val master: BlockManagerMaster,
- val serializerManager: SerializerManager,
- val conf: SparkConf,
- memoryManager: MemoryManager,
- mapOutputTracker: MapOutputTracker,
- shuffleManager: ShuffleManager,
- val blockTransferService: BlockTransferService,
- securityManager: SecurityManager,
- numUsableCores: Int)
- extends BlockDataManager with BlockEvictionHandler with Logging {
- private[spark] val externalShuffleServiceEnabled =
- conf.getBoolean("spark.shuffle.service.enabled", false)
- val diskBlockManager = {
- // Only perform cleanup if an external service is not serving our shuffle files.
- val deleteFilesOnStop =
- !externalShuffleServiceEnabled || executorId == SparkContext.DRIVER_IDENTIFIER
- new DiskBlockManager(conf, deleteFilesOnStop)
- }
- // Visible for testing
- private[storage] val blockInfoManager = new BlockInfoManager
- private val futureExecutionContext = ExecutionContext.fromExecutorService(
- ThreadUtils.newDaemonCachedThreadPool("block-manager-future", 128))
- // Actual storage of where blocks are kept
- private[spark] val memoryStore =
- new MemoryStore(conf, blockInfoManager, serializerManager, memoryManager, this)
- private[spark] val diskStore = new DiskStore(conf, diskBlockManager, securityManager)
- memoryManager.setMemoryStore(memoryStore)
- // Note: depending on the memory manager, `maxMemory` may actually vary over time.
- // However, since we use this only for reporting and logging, what we actually want here is
- // the absolute maximum value that `maxMemory` can ever possibly reach. We may need
- // to revisit whether reporting this value as the "max" is intuitive to the user.
- private val maxOnHeapMemory = memoryManager.maxOnHeapStorageMemory
- private val maxOffHeapMemory = memoryManager.maxOffHeapStorageMemory
- // Port used by the external shuffle service. In Yarn mode, this may already be
- // set through the Hadoop configuration as the server is launched in the Yarn NM.
- private val externalShuffleServicePort = {
- val tmpPort = Utils.getSparkOrYarnConfig(conf, "spark.shuffle.service.port", "7337").toInt
- if (tmpPort == 0) {
- // for testing, we set "spark.shuffle.service.port" to 0 in the yarn config, so yarn finds
- // an open port. But we still need to tell our spark apps the right port to use. So
- // only if the yarn config has the port set to 0, we prefer the value in the spark config
- conf.get("spark.shuffle.service.port").toInt
- } else {
- tmpPort
- }
- }
- var blockManagerId: BlockManagerId = _
- // Address of the server that serves this executor's shuffle files. This is either an external
- // service, or just our own Executor's BlockManager.
- private[spark] var shuffleServerId: BlockManagerId = _
- // Client to read other executors' shuffle files. This is either an external service, or just the
- // standard BlockTransferService to directly connect to other Executors.
- private[spark] val shuffleClient = if (externalShuffleServiceEnabled) {
- val transConf = SparkTransportConf.fromSparkConf(conf, "shuffle", numUsableCores)
- new ExternalShuffleClient(transConf, securityManager,
- securityManager.isAuthenticationEnabled(), conf.get(config.SHUFFLE_REGISTRATION_TIMEOUT))
- } else {
- blockTransferService
- }
- // Max number of failures before this block manager refreshes the block locations from the driver
- private val maxFailuresBeforeLocationRefresh =
- conf.getInt("spark.block.failures.beforeLocationRefresh", )
- private val slaveEndpoint = rpcEnv.setupEndpoint(
- "BlockManagerEndpoint" + BlockManager.ID_GENERATOR.next,
- new BlockManagerSlaveEndpoint(rpcEnv, this, mapOutputTracker))
- // Pending re-registration action being executed asynchronously or null if none is pending.
- // Accesses should synchronize on asyncReregisterLock.
- private var asyncReregisterTask: Future[Unit] = null
- private val asyncReregisterLock = new Object
- // Field related to peer block managers that are necessary for block replication
- @volatile private var cachedPeers: Seq[BlockManagerId] = _
- private val peerFetchLock = new Object
- private var lastPeerFetchTime = 0L
- private var blockReplicationPolicy: BlockReplicationPolicy = _
- // A TempFileManager used to track all the files of remote blocks which above the
- // specified memory threshold. Files will be deleted automatically based on weak reference.
- // Exposed for test
- private[storage] val remoteBlockTempFileManager =
- new BlockManager.RemoteBlockTempFileManager(this)
- private val maxRemoteBlockToMem = conf.get(config.MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM)
- /**
- * Initializes the BlockManager with the given appId. This is not performed in the constructor as
- * the appId may not be known at BlockManager instantiation time (in particular for the driver,
- * where it is only learned after registration with the TaskScheduler).
- *
- * This method initializes the BlockTransferService and ShuffleClient, registers with the
- * BlockManagerMaster, starts the BlockManagerWorker endpoint, and registers with a local shuffle
- * service if configured.
- */
- def initialize(appId: String): Unit = {
- blockTransferService.init(this)
- shuffleClient.init(appId)
- blockReplicationPolicy = {
- val priorityClass = conf.get(
- "spark.storage.replication.policy", classOf[RandomBlockReplicationPolicy].getName)
- val clazz = Utils.classForName(priorityClass)
- val ret = clazz.newInstance.asInstanceOf[BlockReplicationPolicy]
- logInfo(s"Using $priorityClass for block replication policy")
- ret
- }
- val id =
- BlockManagerId(executorId, blockTransferService.hostName, blockTransferService.port, None)
- val idFromMaster = master.registerBlockManager(
- id,
- maxOnHeapMemory,
- maxOffHeapMemory,
- slaveEndpoint)
- blockManagerId = if (idFromMaster != null) idFromMaster else id
- shuffleServerId = if (externalShuffleServiceEnabled) {
- logInfo(s"external shuffle service port = $externalShuffleServicePort")
- BlockManagerId(executorId, blockTransferService.hostName, externalShuffleServicePort)
- } else {
- blockManagerId
- }
- // Register Executors' configuration with the local shuffle service, if one should exist.
- if (externalShuffleServiceEnabled && !blockManagerId.isDriver) {
- registerWithExternalShuffleServer()
- }
- logInfo(s"Initialized BlockManager: $blockManagerId")
- }
- def shuffleMetricsSource: Source = {
- import BlockManager._
- if (externalShuffleServiceEnabled) {
- new ShuffleMetricsSource("ExternalShuffle", shuffleClient.shuffleMetrics())
- } else {
- new ShuffleMetricsSource("NettyBlockTransfer", shuffleClient.shuffleMetrics())
- }
- }
- private def registerWithExternalShuffleServer() {
- logInfo("Registering executor with local external shuffle service.")
- val shuffleConfig = new ExecutorShuffleInfo(
- diskBlockManager.localDirs.map(_.toString),
- diskBlockManager.subDirsPerLocalDir,
- shuffleManager.getClass.getName)
- val MAX_ATTEMPTS = conf.get(config.SHUFFLE_REGISTRATION_MAX_ATTEMPTS)
- val SLEEP_TIME_SECS = 5
- for (i <- 1 to MAX_ATTEMPTS) {
- try {
- // Synchronous and will throw an exception if we cannot connect.
- shuffleClient.asInstanceOf[ExternalShuffleClient].registerWithShuffleServer(
- shuffleServerId.host, shuffleServerId.port, shuffleServerId.executorId, shuffleConfig)
- return
- } catch {
- case e: Exception if i < MAX_ATTEMPTS =>
- logError(s"Failed to connect to external shuffle server, will retry ${MAX_ATTEMPTS - i}"
- + s" more times after waiting $SLEEP_TIME_SECS seconds...", e)
- Thread.sleep(SLEEP_TIME_SECS * 1000)
- case NonFatal(e) =>
- throw new SparkException("Unable to register with external shuffle server due to : " +
- e.getMessage, e)
- }
- }
- }
- /**
- * Report all blocks to the BlockManager again. This may be necessary if we are dropped
- * by the BlockManager and come back or if we become capable of recovering blocks on disk after
- * an executor crash.
- *
- * This function deliberately fails silently if the master returns false (indicating that
- * the slave needs to re-register). The error condition will be detected again by the next
- * heart beat attempt or new block registration and another try to re-register all blocks
- * will be made then.
- */
- private def reportAllBlocks(): Unit = {
- logInfo(s"Reporting ${blockInfoManager.size} blocks to the master.")
- for ((blockId, info) <- blockInfoManager.entries) {
- val status = getCurrentBlockStatus(blockId, info)
- if (info.tellMaster && !tryToReportBlockStatus(blockId, status)) {
- logError(s"Failed to report $blockId to master; giving up.")
- return
- }
- }
- }
- /**
- * Re-register with the master and report all blocks to it. This will be called by the heart beat
- * thread if our heartbeat to the block manager indicates that we were not registered.
- *
- * Note that this method must be called without any BlockInfo locks held.
- */
- def reregister(): Unit = {
- // TODO: We might need to rate limit re-registering.
- logInfo(s"BlockManager $blockManagerId re-registering with master")
- master.registerBlockManager(blockManagerId, maxOnHeapMemory, maxOffHeapMemory, slaveEndpoint)
- reportAllBlocks()
- }
- /**
- * Re-register with the master sometime soon.
- */
- private def asyncReregister(): Unit = {
- asyncReregisterLock.synchronized {
- if (asyncReregisterTask == null) {
- asyncReregisterTask = Future[Unit] {
- // This is a blocking action and should run in futureExecutionContext which is a cached
- // thread pool
- reregister()
- asyncReregisterLock.synchronized {
- asyncReregisterTask = null
- }
- }(futureExecutionContext)
- }
- }
- }
- /**
- * For testing. Wait for any pending asynchronous re-registration; otherwise, do nothing.
- */
- def waitForAsyncReregister(): Unit = {
- val task = asyncReregisterTask
- if (task != null) {
- try {
- ThreadUtils.awaitReady(task, Duration.Inf)
- } catch {
- case NonFatal(t) =>
- throw new Exception("Error occurred while waiting for async. reregistration", t)
- }
- }
- }
- /**
- * Interface to get local block data. Throws an exception if the block cannot be found or
- * cannot be read successfully.
- */
- override def getBlockData(blockId: BlockId): ManagedBuffer = {
- if (blockId.isShuffle) {
- shuffleManager.shuffleBlockResolver.getBlockData(blockId.asInstanceOf[ShuffleBlockId])
- } else {
- getLocalBytes(blockId) match {
- case Some(blockData) =>
- new BlockManagerManagedBuffer(blockInfoManager, blockId, blockData, true)
- case None =>
- // If this block manager receives a request for a block that it doesn't have then it's
- // likely that the master has outdated block statuses for this block. Therefore, we send
- // an RPC so that this block is marked as being unavailable from this block manager.
- reportBlockStatus(blockId, BlockStatus.empty)
- throw new BlockNotFoundException(blockId.toString)
- }
- }
- }
- /**
- * Put the block locally, using the given storage level.
- *
- * '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing
- * so may corrupt or change the data stored by the `BlockManager`.
- */
- override def putBlockData(
- blockId: BlockId,
- data: ManagedBuffer,
- level: StorageLevel,
- classTag: ClassTag[_]): Boolean = {
- putBytes(blockId, new ChunkedByteBuffer(data.nioByteBuffer()), level)(classTag)
- }
- /**
- * Get the BlockStatus for the block identified by the given ID, if it exists.
- * NOTE: This is mainly for testing.
- */
- def getStatus(blockId: BlockId): Option[BlockStatus] = {
- blockInfoManager.get(blockId).map { info =>
- val memSize = if (memoryStore.contains(blockId)) memoryStore.getSize(blockId) else 0L
- val diskSize = if (diskStore.contains(blockId)) diskStore.getSize(blockId) else 0L
- BlockStatus(info.level, memSize = memSize, diskSize = diskSize)
- }
- }
- /**
- * Get the ids of existing blocks that match the given filter. Note that this will
- * query the blocks stored in the disk block manager (that the block manager
- * may not know of).
- */
- def getMatchingBlockIds(filter: BlockId => Boolean): Seq[BlockId] = {
- // The `toArray` is necessary here in order to force the list to be materialized so that we
- // don't try to serialize a lazy iterator when responding to client requests.
- (blockInfoManager.entries.map(_._1) ++ diskBlockManager.getAllBlocks())
- .filter(filter)
- .toArray
- .toSeq
- }
- /**
- * Tell the master about the current storage status of a block. This will send a block update
- * message reflecting the current status, *not* the desired storage level in its block info.
- * For example, a block with MEMORY_AND_DISK set might have fallen out to be only on disk.
- *
- * droppedMemorySize exists to account for when the block is dropped from memory to disk (so
- * it is still valid). This ensures that update in master will compensate for the increase in
- * memory on slave.
- */
- private def reportBlockStatus(
- blockId: BlockId,
- status: BlockStatus,
- droppedMemorySize: Long = 0L): Unit = {
- val needReregister = !tryToReportBlockStatus(blockId, status, droppedMemorySize)
- if (needReregister) {
- logInfo(s"Got told to re-register updating block $blockId")
- // Re-registering will report our new block for free.
- asyncReregister()
- }
- logDebug(s"Told master about block $blockId")
- }
- /**
- * Actually send a UpdateBlockInfo message. Returns the master's response,
- * which will be true if the block was successfully recorded and false if
- * the slave needs to re-register.
- */
- private def tryToReportBlockStatus(
- blockId: BlockId,
- status: BlockStatus,
- droppedMemorySize: Long = 0L): Boolean = {
- val storageLevel = status.storageLevel
- val inMemSize = Math.max(status.memSize, droppedMemorySize)
- val onDiskSize = status.diskSize
- master.updateBlockInfo(blockManagerId, blockId, storageLevel, inMemSize, onDiskSize)
- }
- /**
- * Return the updated storage status of the block with the given ID. More specifically, if
- * the block is dropped from memory and possibly added to disk, return the new storage level
- * and the updated in-memory and on-disk sizes.
- */
- private def getCurrentBlockStatus(blockId: BlockId, info: BlockInfo): BlockStatus = {
- info.synchronized {
- info.level match {
- case null =>
- BlockStatus.empty
- case level =>
- val inMem = level.useMemory && memoryStore.contains(blockId)
- val onDisk = level.useDisk && diskStore.contains(blockId)
- val deserialized = if (inMem) level.deserialized else false
- val replication = if (inMem || onDisk) level.replication else 1
- val storageLevel = StorageLevel(
- useDisk = onDisk,
- useMemory = inMem,
- useOffHeap = level.useOffHeap,
- deserialized = deserialized,
- replication = replication)
- val memSize = if (inMem) memoryStore.getSize(blockId) else 0L
- val diskSize = if (onDisk) diskStore.getSize(blockId) else 0L
- BlockStatus(storageLevel, memSize, diskSize)
- }
- }
- }
- /**
- * Get locations of an array of blocks.
- */
- private def getLocationBlockIds(blockIds: Array[BlockId]): Array[Seq[BlockManagerId]] = {
- val startTimeMs = System.currentTimeMillis
- val locations = master.getLocations(blockIds).toArray
- logDebug("Got multiple block location in %s".format(Utils.getUsedTimeMs(startTimeMs)))
- locations
- }
- /**
- * Cleanup code run in response to a failed local read.
- * Must be called while holding a read lock on the block.
- */
- private def handleLocalReadFailure(blockId: BlockId): Nothing = {
- releaseLock(blockId)
- // Remove the missing block so that its unavailability is reported to the driver
- removeBlock(blockId)
- throw new SparkException(s"Block $blockId was not found even though it's read-locked")
- }
- /**
- * Get block from local block manager as an iterator of Java objects.
- */
- def getLocalValues(blockId: BlockId): Option[BlockResult] = {
- logDebug(s"Getting local block $blockId")
- blockInfoManager.lockForReading(blockId) match {
- case None =>
- logDebug(s"Block $blockId was not found")
- None
- case Some(info) =>
- val level = info.level
- logDebug(s"Level for block $blockId is $level")
- val taskAttemptId = Option(TaskContext.get()).map(_.taskAttemptId())
- if (level.useMemory && memoryStore.contains(blockId)) {
- val iter: Iterator[Any] = if (level.deserialized) {
- memoryStore.getValues(blockId).get
- } else {
- serializerManager.dataDeserializeStream(
- blockId, memoryStore.getBytes(blockId).get.toInputStream())(info.classTag)
- }
- // We need to capture the current taskId in case the iterator completion is triggered
- // from a different thread which does not have TaskContext set; see SPARK-18406 for
- // discussion.
- val ci = CompletionIterator[Any, Iterator[Any]](iter, {
- releaseLock(blockId, taskAttemptId)
- })
- Some(new BlockResult(ci, DataReadMethod.Memory, info.size))
- } else if (level.useDisk && diskStore.contains(blockId)) {
- val diskData = diskStore.getBytes(blockId)
- val iterToReturn: Iterator[Any] = {
- if (level.deserialized) {
- val diskValues = serializerManager.dataDeserializeStream(
- blockId,
- diskData.toInputStream())(info.classTag)
- maybeCacheDiskValuesInMemory(info, blockId, level, diskValues)
- } else {
- val stream = maybeCacheDiskBytesInMemory(info, blockId, level, diskData)
- .map { _.toInputStream(dispose = false) }
- .getOrElse { diskData.toInputStream() }
- serializerManager.dataDeserializeStream(blockId, stream)(info.classTag)
- }
- }
- val ci = CompletionIterator[Any, Iterator[Any]](iterToReturn, {
- releaseLockAndDispose(blockId, diskData, taskAttemptId)
- })
- Some(new BlockResult(ci, DataReadMethod.Disk, info.size))
- } else {
- handleLocalReadFailure(blockId)
- }
- }
- }
- /**
- * Get block from the local block manager as serialized bytes.
- */
- def getLocalBytes(blockId: BlockId): Option[BlockData] = {
- logDebug(s"Getting local block $blockId as bytes")
- // As an optimization for map output fetches, if the block is for a shuffle, return it
- // without acquiring a lock; the disk store never deletes (recent) items so this should work
- if (blockId.isShuffle) {
- val shuffleBlockResolver = shuffleManager.shuffleBlockResolver
- // TODO: This should gracefully handle case where local block is not available. Currently
- // downstream code will throw an exception.
- val buf = new ChunkedByteBuffer(
- shuffleBlockResolver.getBlockData(blockId.asInstanceOf[ShuffleBlockId]).nioByteBuffer())
- Some(new ByteBufferBlockData(buf, true))
- } else {
- blockInfoManager.lockForReading(blockId).map { info => doGetLocalBytes(blockId, info) }
- }
- }
- /**
- * Get block from the local block manager as serialized bytes.
- *
- * Must be called while holding a read lock on the block.
- * Releases the read lock upon exception; keeps the read lock upon successful return.
- */
- private def doGetLocalBytes(blockId: BlockId, info: BlockInfo): BlockData = {
- val level = info.level
- logDebug(s"Level for block $blockId is $level")
- // In order, try to read the serialized bytes from memory, then from disk, then fall back to
- // serializing in-memory objects, and, finally, throw an exception if the block does not exist.
- if (level.deserialized) {
- // Try to avoid expensive serialization by reading a pre-serialized copy from disk:
- if (level.useDisk && diskStore.contains(blockId)) {
- // Note: we purposely do not try to put the block back into memory here. Since this branch
- // handles deserialized blocks, this block may only be cached in memory as objects, not
- // serialized bytes. Because the caller only requested bytes, it doesn't make sense to
- // cache the block's deserialized objects since that caching may not have a payoff.
- diskStore.getBytes(blockId)
- } else if (level.useMemory && memoryStore.contains(blockId)) {
- // The block was not found on disk, so serialize an in-memory copy:
- new ByteBufferBlockData(serializerManager.dataSerializeWithExplicitClassTag(
- blockId, memoryStore.getValues(blockId).get, info.classTag), true)
- } else {
- handleLocalReadFailure(blockId)
- }
- } else { // storage level is serialized
- if (level.useMemory && memoryStore.contains(blockId)) {
- new ByteBufferBlockData(memoryStore.getBytes(blockId).get, false)
- } else if (level.useDisk && diskStore.contains(blockId)) {
- val diskData = diskStore.getBytes(blockId)
- maybeCacheDiskBytesInMemory(info, blockId, level, diskData)
- .map(new ByteBufferBlockData(_, false))
- .getOrElse(diskData)
- } else {
- handleLocalReadFailure(blockId)
- }
- }
- }
- /**
- * Get block from remote block managers.
- *
- * This does not acquire a lock on this block in this JVM.
- */
- private def getRemoteValues[T: ClassTag](blockId: BlockId): Option[BlockResult] = {
- val ct = implicitly[ClassTag[T]]
- getRemoteBytes(blockId).map { data =>
- val values =
- serializerManager.dataDeserializeStream(blockId, data.toInputStream(dispose = true))(ct)
- new BlockResult(values, DataReadMethod.Network, data.size)
- }
- }
- /**
- * Return a list of locations for the given block, prioritizing the local machine since
- * multiple block managers can share the same host, followed by hosts on the same rack.
- */
- private def sortLocations(locations: Seq[BlockManagerId]): Seq[BlockManagerId] = {
- val locs = Random.shuffle(locations)
- val (preferredLocs, otherLocs) = locs.partition { loc => blockManagerId.host == loc.host }
- blockManagerId.topologyInfo match {
- case None => preferredLocs ++ otherLocs
- case Some(_) =>
- val (sameRackLocs, differentRackLocs) = otherLocs.partition {
- loc => blockManagerId.topologyInfo == loc.topologyInfo
- }
- preferredLocs ++ sameRackLocs ++ differentRackLocs
- }
- }
- /**
- * Get block from remote block managers as serialized bytes.
- */
- def getRemoteBytes(blockId: BlockId): Option[ChunkedByteBuffer] = {
- logDebug(s"Getting remote block $blockId")
- require(blockId != null, "BlockId is null")
- var runningFailureCount = 0
- var totalFailureCount = 0
- // Because all the remote blocks are registered in driver, it is not necessary to ask
- // all the slave executors to get block status.
- val locationsAndStatus = master.getLocationsAndStatus(blockId)
- val blockSize = locationsAndStatus.map { b =>
- b.status.diskSize.max(b.status.memSize)
- }.getOrElse(0L)
- val blockLocations = locationsAndStatus.map(_.locations).getOrElse(Seq.empty)
- // If the block size is above the threshold, we should pass our FileManager to
- // BlockTransferService, which will leverage it to spill the block; if not, then passed-in
- // null value means the block will be persisted in memory.
- val tempFileManager = if (blockSize > maxRemoteBlockToMem) {
- remoteBlockTempFileManager
- } else {
- null
- }
- val locations = sortLocations(blockLocations)
- val maxFetchFailures = locations.size
- var locationIterator = locations.iterator
- while (locationIterator.hasNext) {
- val loc = locationIterator.next()
- logDebug(s"Getting remote block $blockId from $loc")
- val data = try {
- blockTransferService.fetchBlockSync(
- loc.host, loc.port, loc.executorId, blockId.toString, tempFileManager).nioByteBuffer()
- } catch {
- case NonFatal(e) =>
- runningFailureCount += 1
- totalFailureCount += 1
- if (totalFailureCount >= maxFetchFailures) {
- // Give up trying anymore locations. Either we've tried all of the original locations,
- // or we've refreshed the list of locations from the master, and have still
- // hit failures after trying locations from the refreshed list.
- logWarning(s"Failed to fetch block after $totalFailureCount fetch failures. " +
- s"Most recent failure cause:", e)
- return None
- }
- logWarning(s"Failed to fetch remote block $blockId " +
- s"from $loc (failed attempt $runningFailureCount)", e)
- // If there is a large number of executors then locations list can contain a
- // large number of stale entries causing a large number of retries that may
- // take a significant amount of time. To get rid of these stale entries
- // we refresh the block locations after a certain number of fetch failures
- if (runningFailureCount >= maxFailuresBeforeLocationRefresh) {
- locationIterator = sortLocations(master.getLocations(blockId)).iterator
- logDebug(s"Refreshed locations from the driver " +
- s"after ${runningFailureCount} fetch failures.")
- runningFailureCount = 0
- }
- // This location failed, so we retry fetch from a different one by returning null here
- null
- }
- if (data != null) {
- return Some(new ChunkedByteBuffer(data))
- }
- logDebug(s"The value of block $blockId is null")
- }
- logDebug(s"Block $blockId not found")
- None
- }
- /**
- * Get a block from the block manager (either local or remote).
- *
- * This acquires a read lock on the block if the block was stored locally and does not acquire
- * any locks if the block was fetched from a remote block manager. The read lock will
- * automatically be freed once the result's `data` iterator is fully consumed.
- */
- def get[T: ClassTag](blockId: BlockId): Option[BlockResult] = {
- val local = getLocalValues(blockId)
- if (local.isDefined) {
- logInfo(s"Found block $blockId locally")
- return local
- }
- val remote = getRemoteValues[T](blockId)
- if (remote.isDefined) {
- logInfo(s"Found block $blockId remotely")
- return remote
- }
- None
- }
- /**
- * Downgrades an exclusive write lock to a shared read lock.
- */
- def downgradeLock(blockId: BlockId): Unit = {
- blockInfoManager.downgradeLock(blockId)
- }
- /**
- * Release a lock on the given block with explicit TID.
- * The param `taskAttemptId` should be passed in case we can't get the correct TID from
- * TaskContext, for example, the input iterator of a cached RDD iterates to the end in a child
- * thread.
- */
- def releaseLock(blockId: BlockId, taskAttemptId: Option[Long] = None): Unit = {
- blockInfoManager.unlock(blockId, taskAttemptId)
- }
- /**
- * Registers a task with the BlockManager in order to initialize per-task bookkeeping structures.
- */
- def registerTask(taskAttemptId: Long): Unit = {
- blockInfoManager.registerTask(taskAttemptId)
- }
- /**
- * Release all locks for the given task.
- *
- * @return the blocks whose locks were released.
- */
- def releaseAllLocksForTask(taskAttemptId: Long): Seq[BlockId] = {
- blockInfoManager.releaseAllLocksForTask(taskAttemptId)
- }
- /**
- * Retrieve the given block if it exists, otherwise call the provided `makeIterator` method
- * to compute the block, persist it, and return its values.
- *
- * @return either a BlockResult if the block was successfully cached, or an iterator if the block
- * could not be cached.
- */
- def getOrElseUpdate[T](
- blockId: BlockId,
- level: StorageLevel,
- classTag: ClassTag[T],
- makeIterator: () => Iterator[T]): Either[BlockResult, Iterator[T]] = {
- // Attempt to read the block from local or remote storage. If it's present, then we don't need
- // to go through the local-get-or-put path.
- get[T](blockId)(classTag) match {
- case Some(block) =>
- return Left(block)
- case _ =>
- // Need to compute the block.
- }
- // Initially we hold no locks on this block.
- doPutIterator(blockId, makeIterator, level, classTag, keepReadLock = true) match {
- case None =>
- // doPut() didn't hand work back to us, so the block already existed or was successfully
- // stored. Therefore, we now hold a read lock on the block.
- val blockResult = getLocalValues(blockId).getOrElse {
- // Since we held a read lock between the doPut() and get() calls, the block should not
- // have been evicted, so get() not returning the block indicates some internal error.
- releaseLock(blockId)
- throw new SparkException(s"get() failed for block $blockId even though we held a lock")
- }
- // We already hold a read lock on the block from the doPut() call and getLocalValues()
- // acquires the lock again, so we need to call releaseLock() here so that the net number
- // of lock acquisitions is 1 (since the caller will only call release() once).
- releaseLock(blockId)
- Left(blockResult)
- case Some(iter) =>
- // The put failed, likely because the data was too large to fit in memory and could not be
- // dropped to disk. Therefore, we need to pass the input iterator back to the caller so
- // that they can decide what to do with the values (e.g. process them without caching).
- Right(iter)
- }
- }
- /**
- * @return true if the block was stored or false if an error occurred.
- */
- def putIterator[T: ClassTag](
- blockId: BlockId,
- values: Iterator[T],
- level: StorageLevel,
- tellMaster: Boolean = true): Boolean = {
- require(values != null, "Values is null")
- doPutIterator(blockId, () => values, level, implicitly[ClassTag[T]], tellMaster) match {
- case None =>
- true
- case Some(iter) =>
- // Caller doesn't care about the iterator values, so we can close the iterator here
- // to free resources earlier
- iter.close()
- false
- }
- }
- /**
- * A short circuited method to get a block writer that can write data directly to disk.
- * The Block will be appended to the File specified by filename. Callers should handle error
- * cases.
- */
- def getDiskWriter(
- blockId: BlockId,
- file: File,
- serializerInstance: SerializerInstance,
- bufferSize: Int,
- writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = {
- val syncWrites = conf.getBoolean("spark.shuffle.sync", false)
- new DiskBlockObjectWriter(file, serializerManager, serializerInstance, bufferSize,
- syncWrites, writeMetrics, blockId)
- }
- /**
- * Put a new block of serialized bytes to the block manager.
- *
- * '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing
- * so may corrupt or change the data stored by the `BlockManager`.
- *
- * @return true if the block was stored or false if an error occurred.
- */
- def putBytes[T: ClassTag](
- blockId: BlockId,
- bytes: ChunkedByteBuffer,
- level: StorageLevel,
- tellMaster: Boolean = true): Boolean = {
- require(bytes != null, "Bytes is null")
- doPutBytes(blockId, bytes, level, implicitly[ClassTag[T]], tellMaster)
- }
- /**
- * Put the given bytes according to the given level in one of the block stores, replicating
- * the values if necessary.
- *
- * If the block already exists, this method will not overwrite it.
- *
- * '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing
- * so may corrupt or change the data stored by the `BlockManager`.
- *
- * @param keepReadLock if true, this method will hold the read lock when it returns (even if the
- * block already exists). If false, this method will hold no locks when it
- * returns.
- * @return true if the block was already present or if the put succeeded, false otherwise.
- */
- private def doPutBytes[T](
- blockId: BlockId,
- bytes: ChunkedByteBuffer,
- level: StorageLevel,
- classTag: ClassTag[T],
- tellMaster: Boolean = true,
- keepReadLock: Boolean = false): Boolean = {
- doPut(blockId, level, classTag, tellMaster = tellMaster, keepReadLock = keepReadLock) { info =>
- val startTimeMs = System.currentTimeMillis
- // Since we're storing bytes, initiate the replication before storing them locally.
- // This is faster as data is already serialized and ready to send.
- val replicationFuture = if (level.replication > 1) {
- Future {
- // This is a blocking action and should run in futureExecutionContext which is a cached
- // thread pool. The ByteBufferBlockData wrapper is not disposed of to avoid releasing
- // buffers that are owned by the caller.
- replicate(blockId, new ByteBufferBlockData(bytes, false), level, classTag)
- }(futureExecutionContext)
- } else {
- null
- }
- val size = bytes.size
- if (level.useMemory) {
- // Put it in memory first, even if it also has useDisk set to true;
- // We will drop it to disk later if the memory store can't hold it.
- val putSucceeded = if (level.deserialized) {
- val values =
- serializerManager.dataDeserializeStream(blockId, bytes.toInputStream())(classTag)
- memoryStore.putIteratorAsValues(blockId, values, classTag) match {
- case Right(_) => true
- case Left(iter) =>
- // If putting deserialized values in memory failed, we will put the bytes directly to
- // disk, so we don't need this iterator and can close it to free resources earlier.
- iter.close()
- false
- }
- } else {
- val memoryMode = level.memoryMode
- memoryStore.putBytes(blockId, size, memoryMode, () => {
- if (memoryMode == MemoryMode.OFF_HEAP &&
- bytes.chunks.exists(buffer => !buffer.isDirect)) {
- bytes.copy(Platform.allocateDirectBuffer)
- } else {
- bytes
- }
- })
- }
- if (!putSucceeded && level.useDisk) {
- logWarning(s"Persisting block $blockId to disk instead.")
- diskStore.putBytes(blockId, bytes)
- }
- } else if (level.useDisk) {
- diskStore.putBytes(blockId, bytes)
- }
- val putBlockStatus = getCurrentBlockStatus(blockId, info)
- val blockWasSuccessfullyStored = putBlockStatus.storageLevel.isValid
- if (blockWasSuccessfullyStored) {
- // Now that the block is in either the memory or disk store,
- // tell the master about it.
- info.size = size
- if (tellMaster && info.tellMaster) {
- reportBlockStatus(blockId, putBlockStatus)
- }
- addUpdatedBlockStatusToTaskMetrics(blockId, putBlockStatus)
- }
- logDebug("Put block %s locally took %s".format(blockId, Utils.getUsedTimeMs(startTimeMs)))
- if (level.replication > 1) {
- // Wait for asynchronous replication to finish
- try {
- ThreadUtils.awaitReady(replicationFuture, Duration.Inf)
- } catch {
- case NonFatal(t) =>
- throw new Exception("Error occurred while waiting for replication to finish", t)
- }
- }
- if (blockWasSuccessfullyStored) {
- None
- } else {
- Some(bytes)
- }
- }.isEmpty
- }
- /**
- * Helper method used to abstract common code from [[doPutBytes()]] and [[doPutIterator()]].
- *
- * @param putBody a function which attempts the actual put() and returns None on success
- * or Some on failure.
- */
- private def doPut[T](
- blockId: BlockId,
- level: StorageLevel,
- classTag: ClassTag[_],
- tellMaster: Boolean,
- keepReadLock: Boolean)(putBody: BlockInfo => Option[T]): Option[T] = {
- require(blockId != null, "BlockId is null")
- require(level != null && level.isValid, "StorageLevel is null or invalid")
- val putBlockInfo = {
- val newInfo = new BlockInfo(level, classTag, tellMaster)
- if (blockInfoManager.lockNewBlockForWriting(blockId, newInfo)) {
- newInfo
- } else {
- logWarning(s"Block $blockId already exists on this machine; not re-adding it")
- if (!keepReadLock) {
- // lockNewBlockForWriting returned a read lock on the existing block, so we must free it:
- releaseLock(blockId)
- }
- return None
- }
- }
- val startTimeMs = System.currentTimeMillis
- var exceptionWasThrown: Boolean = true
- val result: Option[T] = try {
- val res = putBody(putBlockInfo)
- exceptionWasThrown = false
- if (res.isEmpty) {
- // the block was successfully stored
- if (keepReadLock) {
- blockInfoManager.downgradeLock(blockId)
- } else {
- blockInfoManager.unlock(blockId)
- }
- } else {
- removeBlockInternal(blockId, tellMaster = false)
- logWarning(s"Putting block $blockId failed")
- }
- res
- } catch {
- // Since removeBlockInternal may throw exception,
- // we should print exception first to show root cause.
- case NonFatal(e) =>
- logWarning(s"Putting block $blockId failed due to exception $e.")
- throw e
- } finally {
- // This cleanup is performed in a finally block rather than a `catch` to avoid having to
- // catch and properly re-throw InterruptedException.
- if (exceptionWasThrown) {
- // If an exception was thrown then it's possible that the code in `putBody` has already
- // notified the master about the availability of this block, so we need to send an update
- // to remove this block location.
- removeBlockInternal(blockId, tellMaster = tellMaster)
- // The `putBody` code may have also added a new block status to TaskMetrics, so we need
- // to cancel that out by overwriting it with an empty block status. We only do this if
- // the finally block was entered via an exception because doing this unconditionally would
- // cause us to send empty block statuses for every block that failed to be cached due to
- // a memory shortage (which is an expected failure, unlike an uncaught exception).
- addUpdatedBlockStatusToTaskMetrics(blockId, BlockStatus.empty)
- }
- }
- if (level.replication > 1) {
- logDebug("Putting block %s with replication took %s"
- .format(blockId, Utils.getUsedTimeMs(startTimeMs)))
- } else {
- logDebug("Putting block %s without replication took %s"
- .format(blockId, Utils.getUsedTimeMs(startTimeMs)))
- }
- result
- }
- /**
- * Put the given block according to the given level in one of the block stores, replicating
- * the values if necessary.
- *
- * If the block already exists, this method will not overwrite it.
- *
- * @param keepReadLock if true, this method will hold the read lock when it returns (even if the
- * block already exists). If false, this method will hold no locks when it
- * returns.
- * @return None if the block was already present or if the put succeeded, or Some(iterator)
- * if the put failed.
- */
- private def doPutIterator[T](
- blockId: BlockId,
- iterator: () => Iterator[T],
- level: StorageLevel,
- classTag: ClassTag[T],
- tellMaster: Boolean = true,
- keepReadLock: Boolean = false): Option[PartiallyUnrolledIterator[T]] = {
- doPut(blockId, level, classTag, tellMaster = tellMaster, keepReadLock = keepReadLock) { info =>
- val startTimeMs = System.currentTimeMillis
- var iteratorFromFailedMemoryStorePut: Option[PartiallyUnrolledIterator[T]] = None
- // Size of the block in bytes
- var size = 0L
- if (level.useMemory) {
- // Put it in memory first, even if it also has useDisk set to true;
- // We will drop it to disk later if the memory store can't hold it.
- if (level.deserialized) {
- memoryStore.putIteratorAsValues(blockId, iterator(), classTag) match {
- case Right(s) =>
- size = s
- case Left(iter) =>
- // Not enough space to unroll this block; drop to disk if applicable
- if (level.useDisk) {
- logWarning(s"Persisting block $blockId to disk instead.")
- diskStore.put(blockId) { channel =>
- val out = Channels.newOutputStream(channel)
- serializerManager.dataSerializeStream(blockId, out, iter)(classTag)
- }
- size = diskStore.getSize(blockId)
- } else {
- iteratorFromFailedMemoryStorePut = Some(iter)
- }
- }
- } else { // !level.deserialized
- memoryStore.putIteratorAsBytes(blockId, iterator(), classTag, level.memoryMode) match {
- case Right(s) =>
- size = s
- case Left(partiallySerializedValues) =>
- // Not enough space to unroll this block; drop to disk if applicable
- if (level.useDisk) {
- logWarning(s"Persisting block $blockId to disk instead.")
- diskStore.put(blockId) { channel =>
- val out = Channels.newOutputStream(channel)
- partiallySerializedValues.finishWritingToStream(out)
- }
- size = diskStore.getSize(blockId)
- } else {
- iteratorFromFailedMemoryStorePut = Some(partiallySerializedValues.valuesIterator)
- }
- }
- }
- } else if (level.useDisk) {
- diskStore.put(blockId) { channel =>
- val out = Channels.newOutputStream(channel)
- serializerManager.dataSerializeStream(blockId, out, iterator())(classTag)
- }
- size = diskStore.getSize(blockId)
- }
- val putBlockStatus = getCurrentBlockStatus(blockId, info)
- val blockWasSuccessfullyStored = putBlockStatus.storageLevel.isValid
- if (blockWasSuccessfullyStored) {
- // Now that the block is in either the memory or disk store, tell the master about it.
- info.size = size
- if (tellMaster && info.tellMaster) {
- reportBlockStatus(blockId, putBlockStatus)
- }
- addUpdatedBlockStatusToTaskMetrics(blockId, putBlockStatus)
- logDebug("Put block %s locally took %s".format(blockId, Utils.getUsedTimeMs(startTimeMs)))
- if (level.replication > 1) {
- val remoteStartTime = System.currentTimeMillis
- val bytesToReplicate = doGetLocalBytes(blockId, info)
- // [SPARK-16550] Erase the typed classTag when using default serialization, since
- // NettyBlockRpcServer crashes when deserializing repl-defined classes.
- // TODO(ekl) remove this once the classloader issue on the remote end is fixed.
- val remoteClassTag = if (!serializerManager.canUseKryo(classTag)) {
- scala.reflect.classTag[Any]
- } else {
- classTag
- }
- try {
- replicate(blockId, bytesToReplicate, level, remoteClassTag)
- } finally {
- bytesToReplicate.dispose()
- }
- logDebug("Put block %s remotely took %s"
- .format(blockId, Utils.getUsedTimeMs(remoteStartTime)))
- }
- }
- assert(blockWasSuccessfullyStored == iteratorFromFailedMemoryStorePut.isEmpty)
- iteratorFromFailedMemoryStorePut
- }
- }
- /**
- * Attempts to cache spilled bytes read from disk into the MemoryStore in order to speed up
- * subsequent reads. This method requires the caller to hold a read lock on the block.
- *
- * @return a copy of the bytes from the memory store if the put succeeded, otherwise None.
- * If this returns bytes from the memory store then the original disk store bytes will
- * automatically be disposed and the caller should not continue to use them. Otherwise,
- * if this returns None then the original disk store bytes will be unaffected.
- */
- private def maybeCacheDiskBytesInMemory(
- blockInfo: BlockInfo,
- blockId: BlockId,
- level: StorageLevel,
- diskData: BlockData): Option[ChunkedByteBuffer] = {
- require(!level.deserialized)
- if (level.useMemory) {
- // Synchronize on blockInfo to guard against a race condition where two readers both try to
- // put values read from disk into the MemoryStore.
- blockInfo.synchronized {
- if (memoryStore.contains(blockId)) {
- diskData.dispose()
- Some(memoryStore.getBytes(blockId).get)
- } else {
- val allocator = level.memoryMode match {
- case MemoryMode.ON_HEAP => ByteBuffer.allocate _
- case MemoryMode.OFF_HEAP => Platform.allocateDirectBuffer _
- }
- val putSucceeded = memoryStore.putBytes(blockId, diskData.size, level.memoryMode, () => {
- // https://issues.apache.org/jira/browse/SPARK-6076
- // If the file size is bigger than the free memory, OOM will happen. So if we
- // cannot put it into MemoryStore, copyForMemory should not be created. That's why
- // this action is put into a `() => ChunkedByteBuffer` and created lazily.
- diskData.toChunkedByteBuffer(allocator)
- })
- if (putSucceeded) {
- diskData.dispose()
- Some(memoryStore.getBytes(blockId).get)
- } else {
- None
- }
- }
- }
- } else {
- None
- }
- }
- /**
- * Attempts to cache spilled values read from disk into the MemoryStore in order to speed up
- * subsequent reads. This method requires the caller to hold a read lock on the block.
- *
- * @return a copy of the iterator. The original iterator passed this method should no longer
- * be used after this method returns.
- */
- private def maybeCacheDiskValuesInMemory[T](
- blockInfo: BlockInfo,
- blockId: BlockId,
- level: StorageLevel,
- diskIterator: Iterator[T]): Iterator[T] = {
- require(level.deserialized)
- val classTag = blockInfo.classTag.asInstanceOf[ClassTag[T]]
- if (level.useMemory) {
- // Synchronize on blockInfo to guard against a race condition where two readers both try to
- // put values read from disk into the MemoryStore.
- blockInfo.synchronized {
- if (memoryStore.contains(blockId)) {
- // Note: if we had a means to discard the disk iterator, we would do that here.
- memoryStore.getValues(blockId).get
- } else {
- memoryStore.putIteratorAsValues(blockId, diskIterator, classTag) match {
- case Left(iter) =>
- // The memory store put() failed, so it returned the iterator back to us:
- iter
- case Right(_) =>
- // The put() succeeded, so we can read the values back:
- memoryStore.getValues(blockId).get
- }
- }
- }.asInstanceOf[Iterator[T]]
- } else {
- diskIterator
- }
- }
- /**
- * Get peer block managers in the system.
- */
- private def getPeers(forceFetch: Boolean): Seq[BlockManagerId] = {
- peerFetchLock.synchronized {
- val cachedPeersTtl = conf.getInt("spark.storage.cachedPeersTtl", 60 * 1000) // milliseconds
- val timeout = System.currentTimeMillis - lastPeerFetchTime > cachedPeersTtl
- if (cachedPeers == null || forceFetch || timeout) {
- cachedPeers = master.getPeers(blockManagerId).sortBy(_.hashCode)
- lastPeerFetchTime = System.currentTimeMillis
- logDebug("Fetched peers from master: " + cachedPeers.mkString("[", ",", "]"))
- }
- cachedPeers
- }
- }
- /**
- * Called for pro-active replenishment of blocks lost due to executor failures
- *
- * @param blockId blockId being replicated
- * @param existingReplicas existing block managers that have a replica
- * @param maxReplicas maximum replicas needed
- */
- def replicateBlock(
- blockId: BlockId,
- existingReplicas: Set[BlockManagerId],
- maxReplicas: Int): Unit = {
- logInfo(s"Using $blockManagerId to pro-actively replicate $blockId")
- blockInfoManager.lockForReading(blockId).foreach { info =>
- val data = doGetLocalBytes(blockId, info)
- val storageLevel = StorageLevel(
- useDisk = info.level.useDisk,
- useMemory = info.level.useMemory,
- useOffHeap = info.level.useOffHeap,
- deserialized = info.level.deserialized,
- replication = maxReplicas)
- // we know we are called as a result of an executor removal, so we refresh peer cache
- // this way, we won't try to replicate to a missing executor with a stale reference
- getPeers(forceFetch = true)
- try {
- replicate(blockId, data, storageLevel, info.classTag, existingReplicas)
- } finally {
- logDebug(s"Releasing lock for $blockId")
- releaseLockAndDispose(blockId, data)
- }
- }
- }
- /**
- * Replicate block to another node. Note that this is a blocking call that returns after
- * the block has been replicated.
- */
- private def replicate(
- blockId: BlockId,
- data: BlockData,
- level: StorageLevel,
- classTag: ClassTag[_],
- existingReplicas: Set[BlockManagerId] = Set.empty): Unit = {
- val maxReplicationFailures = conf.getInt("spark.storage.maxReplicationFailures", 1)
- val tLevel = StorageLevel(
- useDisk = level.useDisk,
- useMemory = level.useMemory,
- useOffHeap = level.useOffHeap,
- deserialized = level.deserialized,
- replication = 1)
- val numPeersToReplicateTo = level.replication - 1
- val startTime = System.nanoTime
- val peersReplicatedTo = mutable.HashSet.empty ++ existingReplicas
- val peersFailedToReplicateTo = mutable.HashSet.empty[BlockManagerId]
- var numFailures = 0
- val initialPeers = getPeers(false).filterNot(existingReplicas.contains)
- var peersForReplication = blockReplicationPolicy.prioritize(
- blockManagerId,
- initialPeers,
- peersReplicatedTo,
- blockId,
- numPeersToReplicateTo)
- while(numFailures <= maxReplicationFailures &&
- !peersForReplication.isEmpty &&
- peersReplicatedTo.size < numPeersToReplicateTo) {
- val peer = peersForReplication.head
- try {
- val onePeerStartTime = System.nanoTime
- logTrace(s"Trying to replicate $blockId of ${data.size} bytes to $peer")
- blockTransferService.uploadBlockSync(
- peer.host,
- peer.port,
- peer.executorId,
- blockId,
- new BlockManagerManagedBuffer(blockInfoManager, blockId, data, false),
- tLevel,
- classTag)
- logTrace(s"Replicated $blockId of ${data.size} bytes to $peer" +
- s" in ${(System.nanoTime - onePeerStartTime).toDouble / 1e6} ms")
- peersForReplication = peersForReplication.tail
- peersReplicatedTo += peer
- } catch {
- case NonFatal(e) =>
- logWarning(s"Failed to replicate $blockId to $peer, failure #$numFailures", e)
- peersFailedToReplicateTo += peer
- // we have a failed replication, so we get the list of peers again
- // we don't want peers we have already replicated to and the ones that
- // have failed previously
- val filteredPeers = getPeers(true).filter { p =>
- !peersFailedToReplicateTo.contains(p) && !peersReplicatedTo.contains(p)
- }
- numFailures += 1
- peersForReplication = blockReplicationPolicy.prioritize(
- blockManagerId,
- filteredPeers,
- peersReplicatedTo,
- blockId,
- numPeersToReplicateTo - peersReplicatedTo.size)
- }
- }
- logDebug(s"Replicating $blockId of ${data.size} bytes to " +
- s"${peersReplicatedTo.size} peer(s) took ${(System.nanoTime - startTime) / 1e6} ms")
- if (peersReplicatedTo.size < numPeersToReplicateTo) {
- logWarning(s"Block $blockId replicated to only " +
- s"${peersReplicatedTo.size} peer(s) instead of $numPeersToReplicateTo peers")
- }
- logDebug(s"block $blockId replicated to ${peersReplicatedTo.mkString(", ")}")
- }
- /**
- * Read a block consisting of a single object.
- */
- def getSingle[T: ClassTag](blockId: BlockId): Option[T] = {
- get[T](blockId).map(_.data.next().asInstanceOf[T])
- }
- /**
- * Write a block consisting of a single object.
- *
- * @return true if the block was stored or false if the block was already stored or an
- * error occurred.
- */
- def putSingle[T: ClassTag](
- blockId: BlockId,
- value: T,
- level: StorageLevel,
- tellMaster: Boolean = true): Boolean = {
- putIterator(blockId, Iterator(value), level, tellMaster)
- }
- /**
- * Drop a block from memory, possibly putting it on disk if applicable. Called when the memory
- * store reaches its limit and needs to free up space.
- *
- * If `data` is not put on disk, it won't be created.
- *
- * The caller of this method must hold a write lock on the block before calling this method.
- * This method does not release the write lock.
- *
- * @return the block's new effective StorageLevel.
- */
- private[storage] override def dropFromMemory[T: ClassTag](
- blockId: BlockId,
- data: () => Either[Array[T], ChunkedByteBuffer]): StorageLevel = {
- logInfo(s"Dropping block $blockId from memory")
- val info = blockInfoManager.assertBlockIsLockedForWriting(blockId)
- var blockIsUpdated = false
- val level = info.level
- // Drop to disk, if storage level requires
- if (level.useDisk && !diskStore.contains(blockId)) {
- logInfo(s"Writing block $blockId to disk")
- data() match {
- case Left(elements) =>
- diskStore.put(blockId) { channel =>
- val out = Channels.newOutputStream(channel)
- serializerManager.dataSerializeStream(
- blockId,
- out,
- elements.toIterator)(info.classTag.asInstanceOf[ClassTag[T]])
- }
- case Right(bytes) =>
- diskStore.putBytes(blockId, bytes)
- }
- blockIsUpdated = true
- }
- // Actually drop from memory store
- val droppedMemorySize =
- if (memoryStore.contains(blockId)) memoryStore.getSize(blockId) else 0L
- val blockIsRemoved = memoryStore.remove(blockId)
- if (blockIsRemoved) {
- blockIsUpdated = true
- } else {
- logWarning(s"Block $blockId could not be dropped from memory as it does not exist")
- }
- val status = getCurrentBlockStatus(blockId, info)
- if (info.tellMaster) {
- reportBlockStatus(blockId, status, droppedMemorySize)
- }
- if (blockIsUpdated) {
- addUpdatedBlockStatusToTaskMetrics(blockId, status)
- }
- status.storageLevel
- }
- /**
- * Remove all blocks belonging to the given RDD.
- *
- * @return The number of blocks removed.
- */
- def removeRdd(rddId: Int): Int = {
- // TODO: Avoid a linear scan by creating another mapping of RDD.id to blocks.
- logInfo(s"Removing RDD $rddId")
- val blocksToRemove = blockInfoManager.entries.flatMap(_._1.asRDDId).filter(_.rddId == rddId)
- blocksToRemove.foreach { blockId => removeBlock(blockId, tellMaster = false) }
- blocksToRemove.size
- }
- /**
- * Remove all blocks belonging to the given broadcast.
- */
- def removeBroadcast(broadcastId: Long, tellMaster: Boolean): Int = {
- logDebug(s"Removing broadcast $broadcastId")
- val blocksToRemove = blockInfoManager.entries.map(_._1).collect {
- case bid @ BroadcastBlockId(`broadcastId`, _) => bid
- }
- blocksToRemove.foreach { blockId => removeBlock(blockId, tellMaster) }
- blocksToRemove.size
- }
- /**
- * Remove a block from both memory and disk.
- */
- def removeBlock(blockId: BlockId, tellMaster: Boolean = true): Unit = {
- logDebug(s"Removing block $blockId")
- blockInfoManager.lockForWriting(blockId) match {
- case None =>
- // The block has already been removed; do nothing.
- logWarning(s"Asked to remove block $blockId, which does not exist")
- case Some(info) =>
- removeBlockInternal(blockId, tellMaster = tellMaster && info.tellMaster)
- addUpdatedBlockStatusToTaskMetrics(blockId, BlockStatus.empty)
- }
- }
- /**
- * Internal version of [[removeBlock()]] which assumes that the caller already holds a write
- * lock on the block.
- */
- private def removeBlockInternal(blockId: BlockId, tellMaster: Boolean): Unit = {
- // Removals are idempotent in disk store and memory store. At worst, we get a warning.
- val removedFromMemory = memoryStore.remove(blockId)
- val removedFromDisk = diskStore.remove(blockId)
- if (!removedFromMemory && !removedFromDisk) {
- logWarning(s"Block $blockId could not be removed as it was not found on disk or in memory")
- }
- blockInfoManager.removeBlock(blockId)
- if (tellMaster) {
- reportBlockStatus(blockId, BlockStatus.empty)
- }
- }
- private def addUpdatedBlockStatusToTaskMetrics(blockId: BlockId, status: BlockStatus): Unit = {
- if (conf.get(config.TASK_METRICS_TRACK_UPDATED_BLOCK_STATUSES)) {
- Option(TaskContext.get()).foreach { c =>
- c.taskMetrics().incUpdatedBlockStatuses(blockId -> status)
- }
- }
- }
- def releaseLockAndDispose(
- blockId: BlockId,
- data: BlockData,
- taskAttemptId: Option[Long] = None): Unit = {
- releaseLock(blockId, taskAttemptId)
- data.dispose()
- }
- def stop(): Unit = {
- blockTransferService.close()
- if (shuffleClient ne blockTransferService) {
- // Closing should be idempotent, but maybe not for the NioBlockTransferService.
- shuffleClient.close()
- }
- remoteBlockTempFileManager.stop()
- diskBlockManager.stop()
- rpcEnv.stop(slaveEndpoint)
- blockInfoManager.clear()
- memoryStore.clear()
- futureExecutionContext.shutdownNow()
- logInfo("BlockManager stopped")
- }
- }
- private[spark] object BlockManager {
- private val ID_GENERATOR = new IdGenerator
- def blockIdsToHosts(
- blockIds: Array[BlockId],
- env: SparkEnv,
- blockManagerMaster: BlockManagerMaster = null): Map[BlockId, Seq[String]] = {
- // blockManagerMaster != null is used in tests
- assert(env != null || blockManagerMaster != null)
- val blockLocations: Seq[Seq[BlockManagerId]] = if (blockManagerMaster == null) {
- env.blockManager.getLocationBlockIds(blockIds)
- } else {
- blockManagerMaster.getLocations(blockIds)
- }
- val blockManagers = new HashMap[BlockId, Seq[String]]
- for (i <- 0 until blockIds.length) {
- blockManagers(blockIds(i)) = blockLocations(i).map(_.host)
- }
- blockManagers.toMap
- }
- private class ShuffleMetricsSource(
- override val sourceName: String,
- metricSet: MetricSet) extends Source {
- override val metricRegistry = new MetricRegistry
- metricRegistry.registerAll(metricSet)
- }
- class RemoteBlockTempFileManager(blockManager: BlockManager)
- extends TempFileManager with Logging {
- private class ReferenceWithCleanup(file: File, referenceQueue: JReferenceQueue[File])
- extends WeakReference[File](file, referenceQueue) {
- private val filePath = file.getAbsolutePath
- def cleanUp(): Unit = {
- logDebug(s"Clean up file $filePath")
- if (!new File(filePath).delete()) {
- logDebug(s"Fail to delete file $filePath")
- }
- }
- }
- private val referenceQueue = new JReferenceQueue[File]
- private val referenceBuffer = Collections.newSetFromMap[ReferenceWithCleanup](
- new ConcurrentHashMap)
- private val POLL_TIMEOUT = 1000
- @volatile private var stopped = false
- private val cleaningThread = new Thread() { override def run() { keepCleaning() } }
- cleaningThread.setDaemon(true)
- cleaningThread.setName("RemoteBlock-temp-file-clean-thread")
- cleaningThread.start()
- override def createTempFile(): File = {
- blockManager.diskBlockManager.createTempLocalBlock()._2
- }
- override def registerTempFileToClean(file: File): Boolean = {
- referenceBuffer.add(new ReferenceWithCleanup(file, referenceQueue))
- }
- def stop(): Unit = {
- stopped = true
- cleaningThread.interrupt()
- cleaningThread.join()
- }
- private def keepCleaning(): Unit = {
- while (!stopped) {
- try {
- Option(referenceQueue.remove(POLL_TIMEOUT))
- .map(_.asInstanceOf[ReferenceWithCleanup])
- .foreach { ref =>
- referenceBuffer.remove(ref)
- ref.cleanUp()
- }
- } catch {
- case _: InterruptedException =>
- // no-op
- case NonFatal(e) =>
- logError("Error in cleaning thread", e)
- }
- }
- }
- }
- }
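
The listing above is the full BlockManager class, but it never shows a caller. As a minimal sketch of how the pieces fit together, the Scala snippet below reaches the node-local BlockManager through SparkEnv and exercises putSingle, getSingle, getOrElseUpdate and removeBlock. The block ids ("demo-block", "demo-block-2") and sample values are made up for illustration; because BlockManager is private[spark], the sketch is placed in the org.apache.spark.storage package (as Spark's own tests do) and assumes it runs inside a live application where SparkEnv has already been initialized.

```scala
package org.apache.spark.storage

import scala.reflect.ClassTag

import org.apache.spark.SparkEnv

// Hypothetical usage sketch: only meaningful inside a running Spark application
// (driver or executor), where SparkEnv.get is populated during startup.
object BlockManagerUsageSketch {
  def cacheAndRead(): Unit = {
    val blockManager = SparkEnv.get.blockManager          // node-local BlockManager
    val blockId = TestBlockId("demo-block")               // made-up id for illustration

    // Store a single object with MEMORY_AND_DISK; returns false if the put failed.
    val stored = blockManager.putSingle(blockId, Seq(1, 2, 3), StorageLevel.MEMORY_AND_DISK)

    // Read it back: get() tries the local memory/disk stores first, then remote peers.
    val value: Option[Seq[Int]] = blockManager.getSingle[Seq[Int]](blockId)
    println(s"stored=$stored, value=$value")

    // getOrElseUpdate either returns the cached block (Left) or, if the block could
    // not be cached, hands the freshly computed iterator back to the caller (Right).
    blockManager.getOrElseUpdate(
      TestBlockId("demo-block-2"),
      StorageLevel.MEMORY_ONLY,
      implicitly[ClassTag[Int]],
      () => Iterator(1, 2, 3)) match {
      case Left(blockResult) => blockResult.data.foreach(println) // lock freed once consumed
      case Right(iter) => iter.foreach(println)                   // caching failed; use values
    }

    blockManager.removeBlock(blockId)                      // drop from both memory and disk
  }
}
```

The Either returned by getOrElseUpdate mirrors the contract documented in the source: Left carries a BlockResult whose read lock is released once its data iterator is fully consumed, while Right returns the computed iterator to the caller when the block could not be cached.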