# Write libsvm format data (write libsvm)
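The converter below expects (extended) libsvm-format text: each line starts with an integer label, optionally followed by a total feature count, then sparse index:value pairs. Two made-up sample lines for illustration (the second omits the optional feature-count field):

1 100 3:0.5 17:1.0 42:2.5
0 5:0.3 99:1.2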

 
 

#!/usr/bin/env python
#coding=gbk
# ==============================================================================
# \file gen-records.py
# \author chenghuige
# \date 2016-08-12 11:52:01.952044
# \Description
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
#from __future__ import print_function

import sys, os

import tensorflow as tf
import numpy as np

flags = tf.app.flags
FLAGS = flags.FLAGS

_float_feature = lambda v: tf.train.Feature(float_list=tf.train.FloatList(value=v))
_int_feature = lambda v: tf.train.Feature(int64_list=tf.train.Int64List(value=v))

#how to store global info, using sequence example?
def main(argv):
  writer = tf.python_io.TFRecordWriter(argv[2])
  for line in open(argv[1]):
    l = line.rstrip().split()
    label = int(l[0])

    start = 1
    num_features = 0
    if ':' not in l[1]:
      # the second field is the optional total feature count, not an index:value pair
      num_features = int(l[1])
      start += 1

    indexes = []
    values = []

    for item in l[start:]:
      index, value = item.split(':')
      indexes.append(int(index))
      values.append(float(value))

    example = tf.train.Example(features=tf.train.Features(feature={
        'label': _int_feature([label]),
        'num_features': _int_feature([num_features]),
        'index': _int_feature(indexes),
        'value': _float_feature(values)
        }))
    writer.write(example.SerializeToString())
  writer.close()  # flush the records to disk

if __name__ == '__main__':
  tf.app.run()
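Run the converter as, e.g., python gen-records.py feature.txt feature.tfrecord (both file names are placeholders). To sanity-check the output, here is a minimal sketch using the same-era TensorFlow API, iterating the raw records and decoding each serialized tf.train.Example:

import tensorflow as tf

# Walk the serialized records in the TFRecord file and peek at the first few.
for i, record in enumerate(tf.python_io.tf_record_iterator('feature.tfrecord')):
  example = tf.train.Example.FromString(record)  # parse the protobuf
  print(example.features.feature['label'].int64_list.value)
  print(example.features.feature['index'].int64_list.value)
  print(example.features.feature['value'].float_list.value)
  if i >= 2:
    break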

 
 

 
 

# Read libsvm format data (read libsvm)

 
 

#!/usr/bin/env python
#coding=gbk
# ==============================================================================
# \file read-records.py
# \author chenghuige
# \date 2016-07-19 17:09:07.466651
# \Description
# ==============================================================================

#@TODO treat comment as sparse input ?

from __future__ import absolute_import
from __future__ import division
#from __future__ import print_function

import sys, os, time

import tensorflow as tf
import numpy as np

flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_integer('batch_size', 5, 'Batch size.')
flags.DEFINE_integer('num_epochs', 10, 'Number of epochs to run trainer.')
flags.DEFINE_integer('num_preprocess_threads', 12, '')

MIN_AFTER_DEQUEUE = 10000

def read(filename_queue):
  reader = tf.TFRecordReader()
  _, serialized_example = reader.read(filename_queue)
  return serialized_example

def decode(batch_serialized_examples):
  features = tf.parse_example(
      batch_serialized_examples,
      features={
          'label' : tf.FixedLenFeature([], tf.int64),
          'index' : tf.VarLenFeature(tf.int64),
          'value' : tf.VarLenFeature(tf.float32),
      })

  label = features['label']
  index = features['index']
  value = features['value']

  return label, index, value

def batch_inputs(files, batch_size, num_epochs=None, num_preprocess_threads=1):
  """Reads input data num_epochs times."""
  if not num_epochs: num_epochs = None

  with tf.name_scope('input'):
    filename_queue = tf.train.string_input_producer(
        files, num_epochs=num_epochs)

    serialized_example = read(filename_queue)
    batch_serialized_examples = tf.train.shuffle_batch(
        [serialized_example],
        batch_size=batch_size,
        num_threads=num_preprocess_threads,
        capacity=MIN_AFTER_DEQUEUE + (num_preprocess_threads + 1) * batch_size,
        # Ensures a minimum amount of shuffling of examples.
        min_after_dequeue=MIN_AFTER_DEQUEUE)

  return decode(batch_serialized_examples)

def read_records():
  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    # Input labels, feature indexes and values.
    tf_record_pattern = sys.argv[1]
    data_files = tf.gfile.Glob(tf_record_pattern)
    label, index, value = batch_inputs(
        data_files,
        batch_size=FLAGS.batch_size,
        num_epochs=FLAGS.num_epochs,
        num_preprocess_threads=FLAGS.num_preprocess_threads)

    # The op for initializing the variables.
    init_op = tf.group(tf.initialize_all_variables(),
                       tf.initialize_local_variables())

    # Create a session for running operations in the Graph.
    #sess = tf.Session()
    sess = tf.InteractiveSession()

    # Initialize the variables (the trained variables and the epoch counter).
    sess.run(init_op)

    # Start input enqueue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
      step = 0
      while not coord.should_stop():
        start_time = time.time()
        label_, index_, value_ = sess.run([label, index, value])
        print(label_)
        print(index_)
        print(value_)
        print(index_[0])
        print(index_[1])
        print(index_[2])
        duration = time.time() - start_time
        step += 1
    except tf.errors.OutOfRangeError:
      print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
    finally:
      # When done, ask the threads to stop.
      coord.request_stop()

    # Wait for threads to finish.
    coord.join(threads)
    sess.close()

def main(_):
  read_records()

if __name__ == '__main__':
  tf.app.run()
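Note that because 'index' and 'value' are declared as tf.VarLenFeature, sess.run returns them as SparseTensorValue namedtuples rather than dense arrays; that is why index_[0], index_[1], index_[2] above print the indices, values and shape fields. A small sketch of unpacking one batch (the field name shape was renamed dense_shape in later TensorFlow releases):

label_, index_, value_ = sess.run([label, index, value])
# label_ is a dense numpy array of shape [batch_size].
# index_.indices -> [[row, k], ...], the position of each nonzero entry
# index_.values  -> the feature ids themselves
# index_.shape   -> [batch_size, max_nonzeros_in_this_batch]
rows = index_.indices[:, 0]  # which example in the batch each id belongs to
ids = index_.values          # the libsvm feature indexes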

 
 

# Text classification

https://github.com/chenghuige/tensorflow-example

 
 

Using TFRecord only needs a small modification, like the code below; I will update the code on GitHub soon.

 
 

class SparseClassificationTrainer(object):
  """General framework for sparse binary classification trainers.

  Sparse binary classification uses the sparse embedding lookup trick,
  see https://github.com/tensorflow/tensorflow/issues/342
  """
  def __init__(self, dataset=None, num_features=0):
    if dataset is not None and type(dataset) != TfDataSet:
      self.labels = dataset.labels
      self.features = dataset.features
      self.num_features = dataset.num_features
      self.num_classes = dataset.num_classes
    else:
      self.features = SparseFeatures()
      self.num_features = num_features
      self.num_classes = None

    self.index_only = False
    self.total_features = self.num_features

    if type(dataset) != TfDataSet:
      # Feed-dict path: build placeholders and assemble SparseTensors from them.
      self.sp_indices = tf.placeholder(tf.int64, name='sp_indices')
      self.sp_shape = tf.placeholder(tf.int64, name='sp_shape')
      self.sp_ids_val = tf.placeholder(tf.int64, name='sp_ids_val')
      self.sp_weights_val = tf.placeholder(tf.float32, name='sp_weights_val')
      self.sp_ids = tf.SparseTensor(self.sp_indices, self.sp_ids_val, self.sp_shape)
      self.sp_weights = tf.SparseTensor(self.sp_indices, self.sp_weights_val, self.sp_shape)

      self.X = (self.sp_ids, self.sp_weights)
      self.Y = tf.placeholder(tf.int32)  # same as batch size
    else:
      # TFRecord path: the input tensors come straight from the read graph.
      self.X = (dataset.index, dataset.value)
      self.Y = dataset.label

    self.type = 'sparse'


MIN_AFTER_DEQUEUE = 10000

def read(filename_queue):
  reader = tf.TFRecordReader()
  _, serialized_example = reader.read(filename_queue)
  return serialized_example

def decode(batch_serialized_examples):
  features = tf.parse_example(
      batch_serialized_examples,
      features={
          'label' : tf.FixedLenFeature([], tf.int64),
          'index' : tf.VarLenFeature(tf.int64),
          'value' : tf.VarLenFeature(tf.float32),
      })

  label = features['label']
  index = features['index']
  value = features['value']

  return label, index, value

def batch_inputs(files, batch_size, num_epochs=None, num_preprocess_threads=12):
  if not num_epochs: num_epochs = None

  with tf.name_scope('input'):
    filename_queue = tf.train.string_input_producer(
        files, num_epochs=num_epochs)

    serialized_example = read(filename_queue)
    batch_serialized_examples = tf.train.shuffle_batch(
        [serialized_example],
        batch_size=batch_size,
        num_threads=num_preprocess_threads,
        capacity=MIN_AFTER_DEQUEUE + (num_preprocess_threads + 1) * batch_size,
        # Ensures a minimum amount of shuffling of examples.
        min_after_dequeue=MIN_AFTER_DEQUEUE)

  return decode(batch_serialized_examples)


class TfDataSet(object):
  def __init__(self, data_files):
    self.data_files = data_files
    #@TODO now only deal with sparse input
    self.features = SparseFeatures()
    self.label = None

  def build_read_graph(self, batch_size):
    tf_record_pattern = self.data_files
    data_files = tf.gfile.Glob(tf_record_pattern)
    self.label, self.index, self.value = batch_inputs(data_files, batch_size)

  def next_batch(self, sess):
    label, index, value = sess.run([self.label, self.index, self.value])

    trX = (index, value)
    trY = label

    return trX, trY


# Usage sketch inside the training loop:
trainset = melt.load_dataset(trainset_file, is_record=FLAGS.is_record)
if FLAGS.is_record:
  trainset.build_read_graph(batch_size)

step = 0
while not coord.should_stop():
  #self.trainer.X, self.trainer.Y = trainset.next_batch(self.session)
  _, cost_, accuracy_ = self.session.run([self.train_op, self.cost, self.accuracy])
  if step % 100 == 0:
    print 'step:', step, 'train precision@1:', accuracy_, 'cost:', cost_
  if step % 1000 == 0:
    pass
  step += 1
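For completeness: the "sparse embedding look up trick" referenced in the docstring reduces, for a linear classifier, to something like the sketch below. Names and shapes here are illustrative assumptions, not the repository's actual code; tf.nn.embedding_lookup_sparse does the weighted sum without ever densifying the input.

# Hypothetical linear layer over sparse libsvm features.
w = tf.get_variable('w', [num_features, num_classes])
b = tf.get_variable('b', [num_classes])

sp_ids, sp_weights = trainer.X  # SparseTensors of feature indexes / values
# combiner='sum' computes sum_i value_i * w[index_i] for each example,
# i.e. the sparse matrix product X * w.
logits = tf.nn.embedding_lookup_sparse(w, sp_ids, sp_weights, combiner='sum') + b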
