Hadoop fs -put bandwidth 暴力版

/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */

// scalastyle:off println

package com.weibo.tools

import java.io.{BufferedInputStream,FileInputStream}

import java.net.URI

import java.io.BufferedInputStream

import java.util.concurrent.TimeUnit

import org.apache.hadoop.conf.{Configuration => hdfsConfig}

import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

import org.apache.hadoop.io.IOUtils

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Rate-limited "hadoop fs -put": appends a local file to an HDFS file in
 * chunks of `bandwidth` kilobytes, pausing roughly one second after every
 * full chunk so the average throughput stays near `bandwidth` KB/s.
 *
 * The limit is enforced purely by pacing reads/writes, so instantaneous
 * throughput may oscillate around the target.
 */
object Bandwidthlimited_local2HDFS_Writer {

  /** Bytes per kilobyte; the `bandwidth` argument is expressed in this unit. */
  val kiloByte = 1024

  /**
   * Pause after each throttled chunk, in milliseconds.
   * NOTE(review): 999 rather than 1000 presumably leaves a little room for
   * the I/O itself -- confirm before changing.
   */
  private val throttleSleepMillis = 999L

  /**
   * Converts the textual bandwidth (in KB, optionally suffixed with "K" or
   * "KB" as the usage text suggests) into a chunk size in bytes.
   *
   * @throws NumberFormatException    if the numeric part is not an integer
   * @throws IllegalArgumentException if the value is not positive or the
   *                                  byte count would overflow an Int
   */
  private def chunkSizeInBytes(bandwidth: String): Int = {
    val digits = bandwidth.trim
      .toUpperCase(java.util.Locale.ROOT)
      .stripSuffix("KB")
      .stripSuffix("K")
      .trim
    val kiloBytes = digits.toInt
    // A zero-sized buffer would make the copy loop spin forever, and a huge
    // value would silently overflow the Int multiplication below.
    require(kiloBytes > 0 && kiloBytes <= Int.MaxValue / kiloByte,
      s"bandwidth must be between 1 and ${Int.MaxValue / kiloByte} KB, got: ${bandwidth}")
    kiloBytes * kiloByte
  }

  /**
   * Reads from `in` until `buf` is full or end-of-stream is reached.
   *
   * @return number of bytes placed in `buf`; less than `buf.length` only at
   *         end-of-stream (0 when nothing was left to read)
   */
  private def fillBuffer(in: java.io.InputStream, buf: Array[Byte]): Int = {
    var filled = 0
    var endOfStream = false
    while (!endOfStream && filled < buf.length) {
      val n = in.read(buf, filled, buf.length - filled)
      if (n < 0) endOfStream = true else filled += n
    }
    filled
  }

  /**
   * Copies at most one buffer from `inStream` to `outputStream`, then sleeps
   * for the throttle interval.
   *
   * @param inStream       local source stream
   * @param outputStream   HDFS destination stream
   * @param log_buffer     scratch buffer; its length bounds the chunk size
   * @param pre_buffer_sum bytes uploaded before this call
   * @param totalSize      total size of the source (currently unused)
   * @return bytes uploaded including this chunk; equal to `pre_buffer_sum`
   *         when the stream was already exhausted
   */
  def upload_one_buffer(inStream : java.io.BufferedInputStream,
    outputStream : org.apache.hadoop.fs.FSDataOutputStream,
    log_buffer : Array[Byte],
    pre_buffer_sum : Long,
    totalSize : Long
  ) : Long = {
    val readSize = inStream.read(log_buffer)
    if (readSize <= 0) {
      // End of stream (-1) or empty buffer (0): nothing to upload, and the
      // running total must not be decremented.
      pre_buffer_sum
    } else {
      // Write only the bytes actually read, without copying the array.
      outputStream.write(log_buffer, 0, readSize)
      outputStream.flush()
      TimeUnit.MILLISECONDS.sleep(throttleSleepMillis)
      pre_buffer_sum + readSize
    }
  }

  /**
   * Appends the local file `localSrcPath` to the HDFS file `remoteTarPath`
   * (created first if absent), throttled to about `bandwidth` KB per second.
   *
   * @param sc            Spark context supplying the Hadoop configuration
   * @param localSrcPath  path of the local file to upload
   * @param remoteTarPath HDFS path to append to
   * @param bandwidth     target rate in KB/s, e.g. "10" (a trailing "K"/"KB" is accepted)
   * @return number of bytes uploaded
   */
  def LocalLog2HDFS_Writer(sc : SparkContext,
    localSrcPath : String,
    remoteTarPath : String,
    bandwidth : String
  ) : Long = {
    // Validate before touching any stream so a bad argument cannot leak handles.
    val buffer_size = chunkSizeInBytes(bandwidth)

    sc.hadoopConfiguration.setBoolean("dfs.support.append",true)
    val hdfs = FileSystem.get(new URI("/"), sc.hadoopConfiguration)
    val filePath = new Path(remoteTarPath)

    // File.length is a Long, unlike InputStream.available which is an Int
    // and therefore cannot describe sources of 2 GB or more.
    val totalSize = new java.io.File(localSrcPath).length()

    val inStream = new BufferedInputStream(new FileInputStream(localSrcPath))
    try {
      if (hdfs.exists(filePath)) {
        println(hdfs.getFileStatus(filePath).toString)
      } else {
        hdfs.create(filePath).close()
      }

      val outputStream = hdfs.append(filePath)
      try {
        val log_buffer = new Array[Byte](buffer_size)
        var buffer_sum = 0L
        var readSize = fillBuffer(inStream, log_buffer)
        while (readSize > 0) {
          outputStream.write(log_buffer, 0, readSize)
          outputStream.flush()
          outputStream.hflush()
          buffer_sum += readSize

          // Guard the percentage against an empty/growing source file.
          val percent = if (totalSize > 0) buffer_sum * 100 / totalSize else 100L
          println(s"${localSrcPath} uploading. ${buffer_sum} uploaded. readSize : ${readSize}. ${percent}% finished. ")

          if (readSize == buffer_size) {
            // Full chunk: pace the upload, then fetch the next chunk.
            TimeUnit.MILLISECONDS.sleep(throttleSleepMillis)
            readSize = fillBuffer(inStream, log_buffer)
          } else {
            // Short chunk means end-of-stream was reached; no pause needed.
            readSize = 0
          }
        }
        buffer_sum
      } finally {
        outputStream.close()
      }
    } finally {
      // Runs even if closing the HDFS stream failed.
      inStream.close()
    }
  }

  /**
   * Command-line style entry: prints the usage text, then uploads
   * `args(0)` to `args(1)` at `args(2)` KB/s.
   *
   * NOTE(review): the usage text still mentions a 1.999G limit; that figure
   * described the earlier available()-based implementation. The string is
   * kept unchanged because it is user-facing output.
   *
   * @throws IllegalArgumentException if fewer than three arguments are given
   * @return number of bytes uploaded
   */
  def Local2HDFS_Writer(sc : SparkContext, args: Array[String]) : Long = {
    val helper_info = """    the file localSrcPath pointed limited 1.999G

    Bandwidthlimited_local2HDFS_Writer localSrcPath remoteTarPath bandwidth=10K(by KB)"""
    println(helper_info)
    require(args.size >= 3, helper_info)
    val localSrcPath = args(0)
    val remoteTarPath = args(1)
    val bandwidth = args(2)
    LocalLog2HDFS_Writer(sc, localSrcPath, remoteTarPath, bandwidth)
  }

  /**
   * Uploads every (localSrcPath, remoteTarPath) pair in `taskList`, one after
   * another, each throttled to `bandwidth` KB/s.
   *
   * @return number of files uploaded
   */
  def LocalLogReducer2HDFS(sc : SparkContext, taskList : List[(String, String)], bandwidth : String) : Int = {
    // foreach is strict; the previous iterator.map was lazy and never ran.
    taskList.foreach {
      case (localSrcPath, remoteTarPath) =>
        LocalLog2HDFS_Writer(sc, localSrcPath, remoteTarPath, bandwidth)
    }
    taskList.size
  }

  /** Placeholder: currently performs no work. */
  def LocalLogReducer(sc : SparkContext, srcParentPath : String, bandwidth : String) : Unit = {}

  /** Starts a single-threaded local Spark context and runs one upload described by `args`. */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Bandwidthlimited_local2HDFS_Writer")
      .setMaster("local[1]")
    val sc = new SparkContext(conf)
    try {
      Local2HDFS_Writer(sc, args)
    } finally {
      // Stop the context even when argument validation or the upload fails.
      sc.stop()
    }
  }
}

https://github.com/Suanec/Betn_repo/blob/32d56acd3b57efc15573389619ed7793efdf298c/joyCodes/assembly_lib/src/main/scala/Bandwidthlimited_local2HDFS_Writer.scala

暴力破解版，为了优先实现功能，利用Spark + Scala依托于Hadoop API，实现了一个上传限速的功能。存在的问题：

1. hdfs 官方说append本身是不安全的，不建议使用在生产环境中。

2. 限制网速是通过限制流的读写来实现的，可能会出现网速震荡，但平均值符合预期。

3. 网速限制以KB为单位，请留意。

4. 文件大小受限于读入流（InputStream.available 的返回值为 Int），目前仅能保证不超过1.999G的文件正常使用；超过后可能出现进度监控失败、重复上传、乱码等问题。

Hadoop fs -put bandwidth 暴力版的更多相关文章

Hadoop介绍及最新稳定版Hadoop 2.4.1下载地址及单节点安装
Hadoop介绍 Hadoop是一个能对大量数据进行分布式处理的软件框架.其基本的组成包括hdfs分布式文件系统和可以运行在hdfs文件系统上的MapReduce编程模型,以及基于hdfs和MapR ...
执行hadoop fs -ls时出现错误RuntimeException: core-site.xml not found
由于暴力关机,Hadoop fs -ls 出现了下图问题: 问题出现的原因是下面红框框里面的东西,我当时以为从另一个节点下载一个conf.cloudera.yarn文件就能解决问题,发现不行啊,于是删 ...
【转】Hadoop FS Shell命令
FS Shell 调用文件系统(FS)Shell命令应使用 bin/hadoop fs <args> 的形式. 所有的的FS shell命令使用URI路径作为参数.URI格式是scheme ...
hadoop fs 命令
1,hadoop fs –fs [local | <file system URI>]:声明hadoop使用的文件系统,如果不声明的话,使用当前配置文件配置的,按如下顺序查找:hadoop ...
hadoop fs -mkdir testdata错误提示No such file or directory
解决方法: hadoop fs -mkdir -p testdata
Hadoop FS shell commands
命令格式:hadoop fs -command -option args appendToFileUsage: hadoop fs -appendToFile <localsrc> ... ...
何时使用hadoop fs、hadoop dfs与hdfs dfs命令(转)
hadoop fs:使用面最广,可以操作任何文件系统. hadoop dfs与hdfs dfs:只能操作HDFS文件系统相关(包括与Local FS间的操作),前者已经Deprecated,一般使用后 ...
hadoop fs管理文件权限
sudo addgroup Hadoop#添加一个hadoop组sudo usermod -a -G hadoop larry#将当前用户加入到hadoop组修改hadoop目录的权限sudo ch ...
HDFS的基本shell操作，hadoop fs操作命令
(1)分布式文件系统随着数据量越来越多,在一个操作系统管辖的范围存不下了,那么就分配到更多的操作系统管理的磁盘中,但是不方便管理和维护,因此迫切需要一种系统来管理多台机器上的文件,这就是分布式文件管 ...

随机推荐

linux下的EDA——VCS使用
原帖地址:https://blog.csdn.net/moon9999/article/details/75283926 在Linux下对verilogHDL进行功能仿真时非常必要的,下面提供两种常见 ...
vue-cli配置多入口多出口，实现一个项目两个访问地址，区分不同上线环境
最近工作中需要把项目分割成两块,一块需要跑在微信中,通过微信jdk获取用户资料默认登录,一部分需要给原生app做webview的内嵌页面,当然这部分内容是不跑在微信中的. 所以我想到了把项目分成两部分 ...
[C#] 解决Silverlight反射安全关键（SecuritySafeCritical）时报“System.MethodAccessException: 安全透明方法 XXX 无法使用反射访问”的问题
作者: zyl910 一.缘由在Silverlight中使用反射动态访问时,经常遇到"System.MethodAccessException: 安全透明方法 XXX 无法使用反射访问-- ...
[dubbo] Dubbo API 笔记——配置参考
schema 配置参考所有配置项分为三大类服务发现:表示该配置项用于服务的注册与发现,目的是让消费方找到提供方服务治理:表示该配置项用于治理服务间的关系,或为开发测试提供便利条件性能调优:表示 ...
android学习十二（android的Content Provider(内容提供器)的使用）
文件存储和SharePreference存储以及数据存储一般为了安全,最好用于当前应用程序中訪问和存储数据.内容提供器(Content Provider)主要用于在不同的应用程序之间实现数据共享的功能 ...
systemctl -- 系统服务管理器【转】
systemctl -- 系统服务管理器 systemctl 是系统服务管理器命令,它实际上将 service 和 chkconfig 这两个命令组合到一起. 直接运行命令可以列出所有正在运行的服务 ...
【C++】C++中assert和ENDEGU预处理语句
assert 断言语句是C++中的一种预处理宏语句,它能在程序运行时根据否定条件中断程序. C++中的assert()函数可以实现断言功能,在使用assert函数之前应该先引入<cassert& ...
MMU内存管理单元
arm-linux学习-(MMU内存管理单元) 什么是MMU MMU(Memory Management Unit)主要用来管理虚拟存储器.物理存储器的控制线路,同时也负责虚拟地址映射为物理地址,以及 ...
Effective Java 第三版——63. 注意字符串连接的性能
Tips 书中的源代码地址:https://github.com/jbloch/effective-java-3e-source-code 注意,书中的有些代码里方法是基于Java 9 API中的,所 ...
Xshell设置密钥登录CentOS6.5_64位（图文版）
一.环境 CentOS6.5 64位 VMware 14 Pro XSHEEL 5 build 1333 宝塔Linux面板二.生成XSHELL密钥三.上传公钥到服务器,并配置 1.上传vCent ...

Hadoop fs -put bandwidth 暴力版

Hadoop fs -put bandwidth 暴力版的更多相关文章

随机推荐

热门专题