python获取DBLP数据集

#!/usr/bin/python

# -*- coding: UTF-8 -*-

import xml.sax

import io, sys

# Names of DBLP top-level record elements. Not referenced by the code
# visible in this file; only 'article' records are actually exported below.
paper_tags = ('article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis', 'www')

# Names of venue-like sub-elements. Not referenced by the code visible in
# this file.
sub_tags = ('publisher', 'journal', 'booktitle')

# Module-level accumulator: DBLPHandler.record_row appends one formatted
# line per parsed <article>, and DBLPHandler.write_to_file writes them out.
ret = []

class DBLPHandler(xml.sax.ContentHandler):
    """SAX content handler that flattens DBLP ``<article>`` records.

    While parsing, the text of selected sub-elements (see ``_FIELDS``) is
    stored in the attribute of the same name.  When an ``</article>`` end tag
    is seen, the current field values are joined into a single line, appended
    to the module-level ``ret`` list, and the per-record state is cleared.

    If a field occurs several times inside one record (for example several
    ``<author>`` elements), the last occurrence wins.
    """

    # Sub-elements whose text is captured into the attribute of the same name.
    _FIELDS = ('author', 'title', 'year', 'ee', 'journal', 'pages',
               'url', 'number', 'volume')

    def __init__(self):
        # Explicit base call (rather than super()) so the class also works
        # where ContentHandler is an old-style class.
        xml.sax.ContentHandler.__init__(self)
        self.id = 1
        self.reset()

    def reset(self):
        """Clear all per-record state so the next record starts empty."""
        self.dup_article = 0
        self.curtag = None
        # Text chunks received for the element named by ``curtag``.
        self._chunks = []
        self.author = ''
        self.title = ''
        self.pages = ''
        self.year = ''
        self.volume = ''
        self.journal = ''
        self.number = ''
        self.url = ''
        self.ee = ''

    def write_to_file(self, filename):
        """Append every line collected in ``ret`` to *filename* as UTF-8.

        ``ret`` is not cleared afterwards, so calling this twice writes the
        collected lines twice.
        """
        # io.open behaves the same on Python 2 and 3 and handles the encoding;
        # the ``with`` block closes the file even if a write fails.
        with io.open(filename, 'a', encoding='utf8') as file_object:
            file_object.writelines(ret)

    def record_row(self):
        """Append the current record to ``ret`` as one space-free line."""
        fields = (self.author, self.title, self.year, self.pages,
                  self.journal, self.ee, '\n')
        ret.append(u''.join(fields).replace(' ', ''))

    def _flush_text(self):
        """Store the buffered text into the field named by ``curtag``."""
        if self.curtag in self._FIELDS and self._chunks:
            setattr(self, self.curtag, u''.join(self._chunks).strip())
        self._chunks = []

    def startElement(self, tag, attributes):
        """Remember which element is open so ``characters`` can route text."""
        if not tag or not tag.strip():
            return
        # A nested element (e.g. <i> inside <title>) ends the capture of the
        # enclosing field; keep what has been collected so far.
        self._flush_text()
        if tag == 'article':
            self.dup_article += 1
        self.curtag = tag

    def endElement(self, tag):
        """Finish the open field; emit and reset on ``</article>``."""
        if not tag or not tag.strip():
            return
        self._flush_text()
        # Text that follows a closing tag (typically inter-element
        # whitespace) must not be attributed to the element just closed.
        self.curtag = None
        if tag == 'article':
            self.record_row()
            self.reset()

    def characters(self, content):
        """Buffer text belonging to the currently open field element.

        A SAX parser may deliver the text of one element in several chunks,
        so the pieces are accumulated here and joined when the element ends
        instead of letting each chunk overwrite the previous one.
        """
        if self.curtag in self._FIELDS:
            self._chunks.append(content)

if __name__ == "__main__":
    # The input defaults to 'dblp.xml'; exactly one command-line argument
    # overrides it.
    filename = 'dblp.xml'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    # Create an XMLReader.
    parser = xml.sax.make_parser()

    # Turn off namespace processing.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)

    # Install the custom ContentHandler.
    Handler = DBLPHandler()
    parser.setContentHandler(Handler)

    parser.parse(filename)

    # Function-call form of print works on both Python 2 and Python 3.
    print('Parser Complete!')

    # Append the collected article lines to the file named 'out'.
    Handler.write_to_file('out')

另外附处理DNA数据的脚本程序:

# Chunk length, in characters, used by DNA_Handler.read_file for each output
# file id. Index 0 is a placeholder because file ids start at 1.
lens_DNA = [0, 1000, 2000, 2500, 500, 1000, 1500, 2000, 2500]

# Chunk length, in characters, used by DBLP_Handler.read_file for each output
# file id. Index 0 is a placeholder because file ids start at 1.
lens_DBLP = [0, 40, 120, 200, 40, 80, 120, 160, 200]

# NOTE(review): both read_file methods assign their own local ``file_id``,
# so this module-level value is not read by the code visible here.
file_id = 1

# Number of chunks written for one file id before moving on to the next id.
LINE_MAX = 100

class DNA_Handler:
    """Cut a text file into fixed-length chunks spread over several files.

    Input lines are concatenated (newlines removed) until the buffer is
    longer than the chunk length configured in ``lens_DNA`` for the current
    file id.  The buffer is then truncated to that length and appended as one
    line to the output file for that id.  After ``LINE_MAX`` chunks the next
    file id is used.
    """

    def __init__(self):
        # Working buffer; also the text written by write_to_file.
        self.strn = ''

    def write_to_file(self, filename):
        """Append the current buffer ``self.strn`` to *filename*."""
        # ``with`` closes the file even if the write raises.
        with open(filename, 'a+') as file_object:
            file_object.write(self.strn)

    def read_file(self, filename):
        """Read *filename* and write chunks to ``DNA_N1..3`` / ``DNA_M1..5``.

        File ids 1-3 go to ``DNA_N<id>`` and later ids go to
        ``DNA_M<id - 3>``.  Whatever remains in the buffer beyond the chunk
        length is discarded.  Each chunk is also echoed to standard output.
        """
        with open(filename, 'r') as fo:
            self.strn = ''
            file_id = 1
            cnt_lines = 0

            for line in fo:
                # Stop once every configured chunk length has been used.
                if file_id >= len(lens_DNA):
                    break

                self.strn += line.replace('\n', '')
                chunk_len = lens_DNA[file_id]
                if len(self.strn) <= chunk_len:
                    continue

                self.strn = self.strn[:chunk_len] + '\n'
                # Function-call form of print works on Python 2 and 3.
                print(self.strn)

                if file_id <= 3:
                    self.write_to_file('DNA_N' + str(file_id))
                else:
                    self.write_to_file('DNA_M' + str(file_id - 3))

                self.strn = ''
                cnt_lines += 1
                if cnt_lines >= LINE_MAX:
                    file_id += 1
                    cnt_lines = 0

        print('read_finished!')

class DBLP_Handler:
    """Cut a text file into fixed-length chunks spread over several files.

    Input lines are concatenated (newlines removed) until the buffer is
    longer than the chunk length configured in ``lens_DBLP`` for the current
    file id.  The buffer is then truncated to that length and appended as one
    line to ``DBLP_<id>``.  After ``LINE_MAX`` chunks the next file id is
    used.
    """

    def __init__(self):
        # Working buffer; also the text written by write_to_file.
        self.strn = ''

    def write_to_file(self, filename):
        """Append the current buffer ``self.strn`` to *filename*."""
        # ``with`` closes the file even if the write raises.
        with open(filename, 'a+') as file_object:
            file_object.write(self.strn)

    def read_file(self, filename):
        """Read *filename* and write chunks to the files ``DBLP_<id>``.

        Whatever remains in the buffer beyond the chunk length is discarded.
        Each chunk is also echoed to standard output.
        """
        with open(filename, 'r') as fo:
            self.strn = ''
            file_id = 1
            cnt_lines = 0

            for line in fo:
                # Stop once every configured chunk length has been used.
                if file_id >= len(lens_DBLP):
                    break

                self.strn += line.replace('\n', '')
                chunk_len = lens_DBLP[file_id]
                if len(self.strn) <= chunk_len:
                    continue

                self.strn = self.strn[:chunk_len] + '\n'
                # Function-call form of print works on Python 2 and 3.
                print(self.strn)

                self.write_to_file('DBLP_' + str(file_id))

                self.strn = ''
                cnt_lines += 1
                if cnt_lines >= LINE_MAX:
                    file_id += 1
                    cnt_lines = 0

        print('read_finished!')

if __name__ == '__main__':
    # Slice 'human_dna.fa' into the DNA_N* / DNA_M* sample files.
    DNA_Handler().read_file('human_dna.fa')

    # Disabled: the same slicing applied to the DBLP text dump.
    #     bblp_h = DBLP_Handler()
    #     bblp_h.read_file('DBLP_data')

python获取DBLP数据集的更多相关文章

Python读取MNIST数据集
MNIST数据集获取 MNIST数据集是入门机器学习/模式识别的最经典数据集之一.最早于1998年Yann LeCun在论文: Gradient-based learning applied to do ...
使用shell/python获取hostname/fqdn释疑
一直以来被Linux的hostname和fqdn(Fully Qualified Domain Name)困惑了好久,今天专门抽时间把它们的使用细节弄清了. 一.设置hostname/fqdn 在Li ...
python 获取日期
转载原文:python 获取日期作者:m4774411wang python 获取日期我们需要用到time模块,比如time.strftime方法 time.strftime('%Y-%m-% ...
python获取字母在字母表对应位置的几种方法及性能对比较
python获取字母在字母表对应位置的几种方法及性能对比较某些情况下要求我们查出字母在字母表中的顺序,A = 1,B = 2 , C = 3, 以此类推,比如这道题目 https://project ...
python获取文件大小
python获取文件大小 # !/usr/bin/python3.4 # -*- coding: utf-8 -*- import os # 字节bytes转化kb\m\g def formatSiz ...
python 获取一个列表有多少连续列表
python 获取一个列表有多少连续列表例如有列表 [1,2,3] 那么连续列表就是 [1,2],[2,3],[1,2,3] 程序实现如下: 运行结果:
[python实用代码片段]python获取当前时间的前一天，前一周，前一个月
python获取当前时间的前一天,前一周,前一个月. 实用python的datetime.timedelta方法,避免了有的月份是30和31等不同的情况. 获取前一个月的时间,方法实现:首先datet ...
Python获取目录、文件的注意事项
Python获取指定路径下的子目录和文件有两种方法: os.listdir(dir)和os.walk(dir),前者列出dir目录下的所有直接子目录和文件的名称(均不包含完整路径),如 >> ...
Python 获取网卡 MAC 地址
/*********************************************************************** * Python 获取网卡 MAC 地址 * 说明: ...

随机推荐

[原创]cocos2d-x研习录-第二阶概念类之导演类(CCDirector)
CCDirector类是游戏的组织和控制中心(总指挥),它控制着主屏幕的显示.场景的切换和显示,以及游戏的开始.结束和暂停.它的继承关系图如下: CCDirector继承自基类CCObject, ...
论文笔记之：Conditional Generative Adversarial Nets
Conditional Generative Adversarial Nets arXiv 2014 本文是 GANs 的拓展,在产生和判别时,考虑到额外的条件 y,以进行更加"激烈 ...
js中的this指针（三）
当一个函数并非一个对象的属性时,它会被当作一个函数来调用. 此时,函数中的 this 指针被绑定到了全局对象. 后果:方法不能利用内部函数来帮助工作,由于 this 被绑定了错误的值,将无法共享该方法 ...
document.write 方法
如果在文档加载结束后再调用document.write(),那么输出的内容将会重写整个页面. 某次被问及此问题,志之!
Windows Service 开发,安装与调试
Visual Studio.net 2010 Windows Service 开发,安装与调试本示例完成一个每隔一分钟向C:\log.txt文件写入一条记录为例,讲述一个Windows Servic ...
Spring学习 Ioc篇（一）
一直以来忙于项目的开发,Spring虽然不用,一直想系统地学习一下,想看看它的源码,都没有时间,这段时间比较充裕,就索性先把Spring学习下,熟悉各个功能再去探究它内部的实现.就从Ioc篇开始学习. ...
Internetware网构软件（摘抄）
The Internet provides a global open infrastructure for exchanging and sharing of various resources f ...
约瑟夫环（Josephus）的模拟
约瑟夫环问题: 0,1,...,n-1这n个数字排成一个圆圈,从数字0开始每次从这个圆圈里删除第m个数字,求出这个圆圈里剩下的最后一个数字. 这里给出以下几种解法, 1.用队列模拟每次将前m-1个元 ...
关于jsp中response.sendRedirect显示错误
今天在jsp中作判断时,当不同条件时利用response.sendRedirect(“url”)来转向不同的页面,首先是判断验证码,当错误时就转向错误页面:当正确时,才进行用户名和密码的判断,同样也r ...
(C#)算法题
1. Convert string from "AAABBCC" to "A3B2C2". 当面试者提出这个问题的时候,首先需要确认题意:譬如:字符串是不是顺序 ...

python获取DBLP数据集

python获取DBLP数据集的更多相关文章

随机推荐

热门专题