This post was first published on my personal blog at https://kezunlin.me/post/bcdfb73c/; read the latest version there!

tensorrt fp32 fp16 tutorial with caffe pytorch mnist model

Code Example

include headers

#include <assert.h>
#include <sys/stat.h>
#include <time.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <cmath>
#include <algorithm>

#include <cuda_runtime_api.h>
#include "NvCaffeParser.h"
#include "NvOnnxConfig.h"
#include "NvOnnxParser.h"
#include "NvInfer.h"
#include "common.h"

using namespace nvinfer1;
using namespace nvcaffeparser1;

static Logger gLogger;

// Attributes of the MNIST Caffe model
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
//const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

const std::string mnist_data_dir = "data/mnist/";

// Simple PGM (portable greyscale map) reader; forwards to the
// four-argument overload provided by the samples' common.h
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H * INPUT_W])
{
    readPGMFile(fileName, buffer, INPUT_H, INPUT_W);
}
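If you build this outside the TensorRT samples tree, common.h and its four-argument readPGMFile are not available. A minimal stand-in, under the assumption that the digit files are binary "P5" PGMs like the ones shipped with the samples, could look like this:

// Minimal stand-in for the samples' readPGMFile (an assumption, not the
// official implementation). Expects a binary "P5" PGM: magic, width,
// height and max value in the header, then raw 8-bit pixels.
inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW)
{
    std::ifstream infile(fileName, std::ifstream::binary);
    assert(infile.is_open() && "failed to open PGM file");
    std::string magic, w, h, maxval;
    infile >> magic >> w >> h >> maxval;     // header fields
    infile.seekg(1, infile.cur);             // skip the whitespace after the header
    infile.read(reinterpret_cast<char*>(buffer), inH * inW);
}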

caffe model to tensorrt

void caffeToTRTModel(const std::string& deployFilepath,       // Path of Caffe prototxt file
                     const std::string& modelFilepath,        // Path of Caffe model file
                     const std::vector<std::string>& outputs, // Names of network outputs
                     unsigned int maxBatchSize,               // Note: Must be at least as large as the batch we want to run with
                     IHostMemory*& trtModelStream)            // Output buffer for the TRT model
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);

    // Parse caffe model to populate network, then set the outputs
    std::cout << "Reading Caffe prototxt: " << deployFilepath << "\n";
    std::cout << "Reading Caffe model: " << modelFilepath << "\n";
    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();

    bool useFp16 = builder->platformHasFastFp16();
    std::cout << "platformHasFastFp16: " << useFp16 << "\n";
    bool useInt8 = builder->platformHasFastInt8();
    std::cout << "platformHasFastInt8: " << useInt8 << "\n";

    // Create a 16-bit model if it's natively supported
    DataType modelDataType = useFp16 ? DataType::kHALF : DataType::kFLOAT;
    const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFilepath.c_str(),
                                                              modelFilepath.c_str(),
                                                              *network,
                                                              modelDataType);

    // Specify output tensors of the network; without this the builder
    // fails with "Network must have at least one output"
    for (auto& s : outputs)
    {
        std::cout << "output = " << s << std::endl;
        network->markOutput(*blobNameToTensor->find(s.c_str())); // prob
    }

    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);

    // Set up the network for paired-fp16 format if available
    if (useFp16)
        builder->setFp16Mode(true);

    // Build engine
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // Destroy parser and network
    network->destroy();
    parser->destroy();

    // Serialize engine and destroy it
    trtModelStream = engine->serialize();
    engine->destroy();
    builder->destroy();
    //shutdownProtobufLibrary();
}
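Two notes on the FP16 path. First, even though the parser stores weights as kHALF when the platform supports it, it is setFp16Mode(true) that actually lets the builder pick FP16 kernels. Second, this post targets the TensorRT 4/5 builder API; on TensorRT 6 and later the same request goes through an IBuilderConfig instead. A hedged sketch (verify the calls against the headers of your installed version):

// Sketch for TensorRT 6+ only (an assumption about your target version):
// FP16 is requested on an IBuilderConfig rather than via setFp16Mode().
IBuilderConfig* config = builder->createBuilderConfig();
config->setMaxWorkspaceSize(1 << 20);
if (builder->platformHasFastFp16())
    config->setFlag(BuilderFlag::kFP16);
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
config->destroy();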

pytorch onnx to tensorrt

void onnxToTRTModel(const std::string& modelFilepath, // Name of the onnx model file
                    unsigned int maxBatchSize,        // Note: Must be at least as large as the batch we want to run with
                    IHostMemory*& trtModelStream)     // Output buffer for the TensorRT model
{
    // Create the builder
    IBuilder* builder = createInferBuilder(gLogger);

    nvonnxparser::IOnnxConfig* config = nvonnxparser::createONNXConfig();
    config->setModelFileName(modelFilepath.c_str());
    nvonnxparser::IONNXParser* parser = nvonnxparser::createONNXParser(*config);

    // Optional - uncomment below lines to view network layer information
    //config->setPrintLayerInfo(true);
    //parser->reportParsingInfo();

    if (!parser->parse(modelFilepath.c_str(), DataType::kFLOAT))
    {
        std::string msg("failed to parse onnx file");
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
        exit(EXIT_FAILURE);
    }

    if (!parser->convertToTRTNetwork())
    {
        std::string msg("ERROR, failed to convert onnx network into TRT network");
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
        exit(EXIT_FAILURE);
    }
    nvinfer1::INetworkDefinition* network = parser->getTRTNetwork();

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // We don't need the network any more, and we can destroy the parser
    network->destroy();
    parser->destroy();

    // Serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    engine->destroy();
    builder->destroy();
    //shutdownProtobufLibrary();
}
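The IOnnxConfig / IONNXParser combination used here is the legacy ONNX front end and was later removed. On newer releases (roughly TensorRT 6 and later; treat the exact calls as an assumption to check against your version) the parser populates a network you create yourself:

// Sketch of the newer ONNX front end (TensorRT 6+; an assumption to
// verify against your installed version). The parser fills a network
// created with the explicit-batch flag.
IBuilder* builder = createInferBuilder(gLogger);
const auto flags = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
INetworkDefinition* network = builder->createNetworkV2(flags);
nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
if (!parser->parseFromFile(modelFilepath.c_str(), static_cast<int>(ILogger::Severity::kWARNING)))
{
    gLogger.log(ILogger::Severity::kERROR, "failed to parse onnx file");
    exit(EXIT_FAILURE);
}
// network is now populated; build the engine as before.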

do inference

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    int inputIndex = 0, outputIndex = 0;
    printf("Bindings after deserializing:\n");
    for (int bi = 0; bi < engine.getNbBindings(); bi++)
    {
        if (engine.bindingIsInput(bi))
        {
            inputIndex = bi;
            printf("Binding %d (%s): Input.\n", bi, engine.getBindingName(bi));
        }
        else
        {
            outputIndex = bi;
            printf("Binding %d (%s): Output.\n", bi, engine.getBindingName(bi));
        }
    }
    //const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    //const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    std::cout << "inputIndex = " << inputIndex << std::endl;   // 0 data
    std::cout << "outputIndex = " << outputIndex << std::endl; // 1 prob

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
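doInference uses the asynchronous enqueue plus an explicit CUDA stream. If you do not need to overlap transfers and compute, the middle of the function can be replaced by a synchronous variant; a sketch under the assumption that blocking calls are acceptable:

// Synchronous sketch: blocking copies and execute() instead of a stream.
// Drop-in replacement for the cudaMemcpyAsync/enqueue block above.
CHECK(cudaMemcpy(buffers[inputIndex], input,
                 batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice));
context.execute(batchSize, buffers);
CHECK(cudaMemcpy(output, buffers[outputIndex],
                 batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost));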

save and load engine

void SaveEngine(const nvinfer1::IHostMemory& trtModelStream, const std::string& engine_filepath)
{
    std::ofstream file;
    file.open(engine_filepath, std::ios::binary | std::ios::out);
    if (!file.is_open())
    {
        std::cout << "create engine file " << engine_filepath << " failed" << std::endl;
        return;
    }
    file.write((const char*)trtModelStream.data(), trtModelStream.size());
    file.close();
}

ICudaEngine* LoadEngine(IRuntime& runtime, const std::string& engine_filepath)
{
    std::ifstream file;
    file.open(engine_filepath, std::ios::binary | std::ios::in);
    if (!file.is_open())
    {
        std::cout << "read engine file " << engine_filepath << " failed" << std::endl;
        return nullptr;
    }
    file.seekg(0, std::ios::end);
    int length = file.tellg();
    file.seekg(0, std::ios::beg);

    std::shared_ptr<char> data(new char[length], std::default_delete<char[]>());
    file.read(data.get(), length);
    file.close();

    // runtime.deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
    ICudaEngine* engine = runtime.deserializeCudaEngine(data.get(), length, nullptr);
    assert(engine != nullptr);
    return engine;
}
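A quick round-trip check of the two helpers, assuming the cfg/mnist directory already exists (the paths are the ones used later in this post):

// Round-trip sketch: serialize an engine to disk, then load it back.
IHostMemory* trtModelStream{nullptr};
caffeToTRTModel("data/mnist/mnist.prototxt", "data/mnist/mnist.caffemodel",
                std::vector<std::string>{OUTPUT_BLOB_NAME}, 1, trtModelStream);
SaveEngine(*trtModelStream, "cfg/mnist/caffe_minist_fp32.trt");
trtModelStream->destroy();

IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = LoadEngine(*runtime, "cfg/mnist/caffe_minist_fp32.trt");
// ... use the engine, then clean up
engine->destroy();
runtime->destroy();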

example

void demo_save_caffe_to_trt(const std::string& engine_filepath)
{
    std::string deploy_filepath = mnist_data_dir + "mnist.prototxt";
    std::string model_filepath = mnist_data_dir + "mnist.caffemodel";

    // Create TRT model from caffe model and serialize it to a stream
    IHostMemory* trtModelStream{nullptr};
    caffeToTRTModel(deploy_filepath, model_filepath, std::vector<std::string>{OUTPUT_BLOB_NAME}, 1, trtModelStream);
    assert(trtModelStream != nullptr);

    SaveEngine(*trtModelStream, engine_filepath);

    // Destroy stream
    trtModelStream->destroy();
}

void demo_save_onnx_to_trt(const std::string& engine_filepath)
{
    std::string onnx_filepath = mnist_data_dir + "mnist.onnx";

    // Create TRT model from onnx model and serialize it to a stream
    IHostMemory* trtModelStream{nullptr};
    onnxToTRTModel(onnx_filepath, 1, trtModelStream);
    assert(trtModelStream != nullptr);

    SaveEngine(*trtModelStream, engine_filepath);

    // Destroy stream
    trtModelStream->destroy();
}

int mnist_demo()
{
    bool use_caffe = false;
    std::string engine_filepath;
    if (use_caffe) {
        engine_filepath = "cfg/mnist/caffe_minist_fp32.trt";
        demo_save_caffe_to_trt(engine_filepath);
    } else {
        engine_filepath = "cfg/mnist/onnx_minist_fp32.trt";
        demo_save_onnx_to_trt(engine_filepath);
    }
    std::cout << "[API] Save engine to " << engine_filepath << std::endl;

    const int num = 6;
    std::string digit_filepath = mnist_data_dir + std::to_string(num) + ".pgm";

    // Read a digit file
    uint8_t fileData[INPUT_H * INPUT_W];
    readPGMFile(digit_filepath, fileData);

    float data[INPUT_H * INPUT_W];
    if (use_caffe) {
        // Parse mean file
        std::string mean_filepath = mnist_data_dir + "mnist_mean.binaryproto";
        ICaffeParser* parser = createCaffeParser();
        IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(mean_filepath.c_str());
        parser->destroy();

        // Subtract mean from image
        const float* meanData = reinterpret_cast<const float*>(meanBlob->getData()); // size 784
        for (int i = 0; i < INPUT_H * INPUT_W; i++)
            data[i] = float(fileData[i]) - meanData[i];
        meanBlob->destroy();
    } else {
        // The onnx model expects inputs normalized to [0,1]
        for (int i = 0; i < INPUT_H * INPUT_W; i++)
            data[i] = 1.0 - float(fileData[i] / 255.0);
    }

    // Deserialize the engine we serialized earlier
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    std::cout << "[API] Load engine from " << engine_filepath << std::endl;
    ICudaEngine* engine = LoadEngine(*runtime, engine_filepath);
    assert(engine != nullptr);

    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    // Run inference on input data
    float prob[OUTPUT_SIZE];
    doInference(*context, data, prob, 1);

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    std::cout << "\nOutput:\n\n";

    // For onnx we get raw logits as output, so apply softmax to get probabilities
    if (!use_caffe) {
        // Calculate softmax
        float sum{0.0f};
        for (int i = 0; i < OUTPUT_SIZE; i++)
        {
            prob[i] = exp(prob[i]);
            sum += prob[i];
        }
        for (int i = 0; i < OUTPUT_SIZE; i++)
        {
            prob[i] /= sum;
        }
    }

    // Find the class with the highest probability
    float val{0.0f};
    int idx{0};
    for (int i = 0; i < OUTPUT_SIZE; i++)
    {
        val = std::max(val, prob[i]);
        if (val == prob[i]) {
            idx = i;
        }
        std::cout << " Prob " << i << " " << std::fixed << std::setw(5) << std::setprecision(4) << prob[i];
        std::cout << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << "\n";
    }
    std::cout << std::endl;

    return (idx == num && val > 0.9f) ? EXIT_SUCCESS : EXIT_FAILURE;
}

int main(int argc, char** argv)
{
    mnist_demo();
    return 0;
}
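One caveat about the softmax step above: exp(prob[i]) can overflow for large logits. Subtracting the maximum logit first yields the same probabilities (the shift cancels out in the ratio) and keeps exp() in range; a drop-in sketch:

// Numerically stable softmax sketch: shift by the max logit before exp.
float maxLogit = *std::max_element(prob, prob + OUTPUT_SIZE);
float sum{0.0f};
for (int i = 0; i < OUTPUT_SIZE; i++)
{
    prob[i] = std::exp(prob[i] - maxLogit); // exp(x - max) <= 1
    sum += prob[i];
}
for (int i = 0; i < OUTPUT_SIZE; i++)
    prob[i] /= sum;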

results

./bin/sample_mnist
[API] Save engine to cfg/mnist/onnx_minist_fp32.trt
[API] Load engine from cfg/mnist/onnx_minist_fp32.trt
Bindings after deserializing:
Binding 0 (Input3): Input.
Binding 1 (Plus214_Output_0): Output.
inputIndex = 0
outputIndex = 1

Output:

Prob 0 0.00000:
Prob 1 0.00001:
Prob 2 0.00002:
Prob 3 0.00003:
Prob 4 0.00004:
Prob 5 0.00005:
Prob 6 1.00006: **********
Prob 7 0.00007:
Prob 8 0.00008:
Prob 9 0.00009:

History

  • 20190422 created.
