16S data QC pipeline. A single sequencing lane contains many projects, each with its own contract number, and one project may include 16S data, ITS data, or both. A complete pipeline takes all of the upstream demultiplexed data and organizes it into data that is ready for direct analysis. Originally this work was parallelized through SGE, i.e. parallelism at the job-submission level; here the parallelism is implemented at the program level with multiprocessing, so the pipeline can run without an SGE system.
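Before the full script, here is a minimal sketch of the pattern it relies on: a JoinableQueue of jobs, a pool of daemon worker processes pulling from it, and a jobs.join() in the parent that returns once every job has been marked done, after which the result queue is drained. The names handle and run below are placeholders for illustration only; they are not part of the pipeline itself.

import sys
from multiprocessing import Process, JoinableQueue, Queue, cpu_count

def handle(item):
    # placeholder for the real per-sample work (pandaseq + QC in the pipeline)
    return item * item

def worker(jobs, results):
    while True:
        item = jobs.get()           # block until a job is available
        try:
            results.put(handle(item))
        finally:
            jobs.task_done()        # always mark the job done, even on failure

def run(items, concurrency):
    jobs = JoinableQueue()
    results = Queue()
    for item in items:
        jobs.put(item)              # enqueue all work up front, like add_jobs() below
    for _ in range(concurrency):
        p = Process(target=worker, args=(jobs, results))
        p.daemon = True             # daemon workers die with the parent process
        p.start()
    jobs.join()                     # returns once every job has been task_done()'d
    while not results.empty():      # drain results, as main() does after join()
        sys.stderr.write('%s\n' % results.get_nowait())

if __name__ == '__main__':
    run(range(10), cpu_count())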

import os
import sys
import re
import time
import collections
import threading
from multiprocessing import Process, JoinableQueue, Queue, cpu_count
from threading import Thread

from settings import primer, pandaseq_soft
from programs import *

Result = collections.namedtuple("Result", "compact sample_name HQ_fq")


def parse_sam_barcode_file(sam_barcode_file):
    for line in open(sam_barcode_file):
        yield line.strip().split('\t')


def proc(compact, sample_name, work_path, lib_method, data_type):
    # process one sample: pandaseq assembly of read1/read2, then quality filtering
    split_path = '%s/Split' % work_path
    QC_path = '%s/QC' % work_path
    compact_path = '%s/%s' % (QC_path, compact)
    if not os.path.exists(compact_path):
        os.makedirs(compact_path)
    sample_path = '%s/%s' % (compact_path, sample_name)
    if not os.path.exists(sample_path):
        os.makedirs(sample_path)
    original_path = '%s/%s/%s' % (split_path, compact, sample_name)
    (read1, read2) = os.popen('ls %s/*' % original_path).read().strip().split('\n')
    pandaseq_fq = '%s/pandaseq.fq' % sample_path
    pandaseq_log = '%s/pandaseq.log' % sample_path
    pandaseq(pandaseq_soft, read1, read2, pandaseq_fq,
             primer[lib_method][data_type]['forward'],
             primer[lib_method][data_type]['reverse'], pandaseq_log)
    high_quality_fq = '%s/high_quality.fq' % sample_path
    high_quality_log = '%s/high_quality.stat' % sample_path
    QC(pandaseq_fq, high_quality_fq, high_quality_log, data_type)
    return Result(compact, sample_name, high_quality_fq)


def worker(work_path, jobs, results):
    # each worker process pulls jobs until the parent exits; failed jobs are re-queued
    while True:
        try:
            compact, sample_name, lib_method, data_type = jobs.get()
            try:
                result = proc(compact, sample_name, work_path, lib_method, data_type)
                sys.stderr.write('Process %s finished with compact:%s sample_name:%s\n' % (os.getpid(), compact, sample_name))
                results.put(result)
            except:
                sys.stderr.write('Process %s FAILED !!! %s/%s may have some problem!\n' % (os.getpid(), compact, sample_name))
                jobs.put((compact, sample_name, lib_method, data_type))
                sys.stderr.write('The job is re-pushed into the queue, with compact:%s sample_name:%s\n' % (compact, sample_name))
        finally:
            jobs.task_done()


def add_jobs(work_path, sam_barcode_file_list, jobs):
    # read every sam_barcode file and push one job per sample onto the queue
    job_num = 0
    data_type_hash = {}
    for todo, sam_barcode_file in enumerate(sam_barcode_file_list):
        sam_barcode_file = sam_barcode_file.strip()
        if not os.path.isfile(sam_barcode_file):
            continue
        lib_method = get_lib_method(sam_barcode_file)
        if lib_method is None:
            continue
        print 'sam_barcode_file loading: %s ...... ok\n' % sam_barcode_file
        for compact, sample_name, barcode_info, data_type in parse_sam_barcode_file(sam_barcode_file):
            if not data_type_hash.has_key(compact):
                data_type_hash[compact] = {}
            if not data_type_hash[compact].has_key(data_type):
                data_type_hash[compact][data_type] = []
            data_type_hash[compact][data_type].append(sample_name)
            jobs.put((compact, sample_name, lib_method, data_type))
            job_num += 1
            sys.stderr.write('The job is pushed into the queue, with compact:%s sample_name:%s\n' % (compact, sample_name))
    sys.stderr.write('\n### All %s jobs have been pushed into the queue ###\n' % job_num)
    return data_type_hash


def create_processes(concurrency, jobs, work_path, results):
    print '\nBegin creating %s worker Processes...\n' % concurrency
    for _ in range(concurrency):
        process = Process(target=worker, args=(work_path, jobs, results))
        process.daemon = True
        process.start()


def main(work_path, sam_barcode_file_list):
    global concurrency
    split_path = '%s/Split' % work_path
    QC_path = '%s/QC' % work_path
    jobs = JoinableQueue()
    results = Queue()
    canceled = False
    data_type_hash = add_jobs(split_path, sam_barcode_file_list, jobs)
    create_processes(concurrency, jobs, work_path, results)
    try:
        jobs.join()
    except KeyboardInterrupt:
        sys.stderr.write('cancelling ...\n')
        canceled = True
    finally:
        # drain the result queue and record which samples finished
        job_num = 0
        finished_hash = {}
        while not results.empty():
            result = results.get_nowait()
            job_num += 1
            if not finished_hash.has_key(result.compact):
                finished_hash[result.compact] = []
            finished_hash[result.compact].append(result.sample_name)
        sys.stderr.write('all %s work finished!\n\n' % job_num)
    log_out = open('%s/work.log' % QC_path, 'w')
    for compact, sample_list in finished_hash.iteritems():
        for sample_name in sample_list:
            log_out.write('%s\t%s has been finished\n' % (compact, sample_name))
    log_out.close()
    if canceled:
        return False
    # per-compact read statistics
    for compact in os.listdir(QC_path):
        compact_dir = '%s/%s' % (QC_path, compact)
        if not os.path.isdir(compact_dir):
            continue
        sys.stderr.write('Begin stat compact: %s\n' % compact)
        reads_stat(compact_dir)
    sys.stderr.write('All compact stat finished!\n\n')
    reads_stat_all(QC_path, split_path)
    # merge the high-quality reads of each compact/data_type in threads
    merge_threads = set()
    for compact, subitem in data_type_hash.iteritems():
        compact_dir = '%s/%s' % (QC_path, compact)
        for data_type, sample_list in subitem.iteritems():
            merged_file = '%s/%s/%s.together.fna' % (QC_path, compact, data_type)
            t = Thread(target=sampleMerge, args=(sample_list, data_type, compact_dir, merged_file))
            merge_threads.add(t)
            t.start()
            while True:
                # throttle: do not keep more threads alive than the concurrency limit
                if threading.activeCount() < concurrency:
                    break
    for t in threading.enumerate():
        if t in merge_threads:
            t.join()
    sys.stderr.write('\nAll of the pipeline is done!\n')


if __name__ == '__main__':
    sys.argv.pop(0)
    if len(sys.argv) < 1:
        sys.stderr.write('Usage: python run_pipeline.py work_path [process_num]\n  process_num defaults to cpu_count\n')
        sys.exit()
    work_path = os.path.abspath(sys.argv.pop(0))
    sys.stderr.write('Workdir is %s, pipeline begins\n' % work_path)
    sam_barcode_file_list = os.popen('ls %s/Split/sam_barcode.*' % work_path).read().strip().split('\n')
    if len(sys.argv) != 0:
        concurrency = int(sys.argv.pop(0))
    else:
        concurrency = cpu_count()
    main(work_path, sam_barcode_file_list)

Below are the helper routines (the `programs` module imported above):

from __future__ import division
from threading import Thread, Lock
from multiprocessing import cpu_count
import threading
import sys
import os
import re
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord


def fq_reads_num(fq_file):
    # a fastq record is 4 lines, so reads = line count / 4
    wc_out = os.popen('wc -l %s' % fq_file).read().strip()
    return int(int(re.search('^(\d+)', wc_out).group(1)) / 4)


def Q_ave(self):
    # average phred quality of a SeqRecord; monkey-patched onto SeqRecord in QC()
    Q_sum = 0
    for qlist in self.letter_annotations.itervalues():
        for q in qlist:
            Q_sum += q
    return Q_sum / len(self)


def QC(file, out_file, out_stat_file, data_type):
    # keep reads with average quality >= 20 and length within [220, 500]
    SeqRecord.Q_ave = Q_ave
    out_stat = open(out_stat_file, 'w')
    out = open(out_file, 'w')
    count = 0
    high_count = 0
    for record in SeqIO.parse(open(file), 'fastq'):
        count += 1
        if record.Q_ave() < 20:
            continue
        if len(record) < 220 or len(record) > 500:
            continue
        out.write(record.format('fastq'))
        high_count += 1
    high_ratio = high_count / count
    out_stat.write('%s\t%s\t%s\t%s\n' % (data_type, count, high_count, high_ratio))


class MyList(list):
    # a list that prints itself as tab-separated fields
    def __str__(self):
        out_str = ''
        for item in self:
            out_str += item
            out_str += '\t'
        return out_str.strip()


def parse_stat(stat_file):
    tabs = os.popen('cat %s' % stat_file).read().strip().split('\t')
    yield tabs


def parse_stat_files(compact_path):
    for f in os.popen('ls %s/*/*.stat' % compact_path):
        stat_file = f.strip()
        sample_name = re.search('%s\/(\S+)\/high_quality\.stat' % compact_path, stat_file).group(1)
        yield stat_file, sample_name


def reads_stat(compact_path):
    # per-compact table of raw / high-quality read counts
    out = open('%s/reads_stat.xls' % compact_path, 'w')
    sample_reads = {}
    for stat_file, sample_name in parse_stat_files(compact_path):
        for tabs in parse_stat(stat_file):
            sample_reads[sample_name] = tabs
    out.write('sample_name\tsample_type\traw_reads\tHQ_reads\tHQ_ratio\n')
    for sample, tabs in sample_reads.iteritems():
        tabs = MyList(tabs)
        out.write('%s\t%s\n' % (sample, str(tabs)))
    out.close()


def raw_stat_thread(fq_file, lock, compact, sample_name, tabs, out):
    global total_reads
    # sys.stderr.write('thread %s stat with %s %s\n' % (threading.currentThread().ident, compact, sample_name))
    raw_reads = fq_reads_num(fq_file)
    lock.acquire()
    total_reads += raw_reads
    data_type = tabs.pop(0)
    ratio = int(tabs[1]) / raw_reads * 100
    tabs = str(MyList(tabs))
    out.write('%s\t%s\t%s\t%s\t%s\t%2.2f%%\n' % (compact, sample_name, data_type, raw_reads, tabs, ratio))
    lock.release()
    # sys.stderr.write('thread %s finished doing with %s %s\n' % (threading.currentThread().ident, compact, sample_name))


total_reads = 0


def reads_stat_all(work_path, original_path):
    # lane-wide table, one thread per sample for the raw read counting
    global total_reads
    sys.stderr.write('\nmerge stat is beginning ... \n')
    out = open('%s/reads_stat.xls' % work_path, 'w')
    compact_hash = {}
    for f in os.listdir(work_path):
        compact = f.strip()
        compact_path = '%s/%s' % (work_path, compact)
        if not os.path.isdir(compact_path):
            continue
        if not compact_hash.has_key(compact):
            compact_hash[compact] = {}
        for stat_file, sample_name in parse_stat_files(compact_path):
            for tabs in parse_stat(stat_file):
                compact_hash[compact][sample_name] = tabs
    out.write('compact\tsample_name\tdata_type\traw_reads\tpandaseq_reads\tHQ_reads\tratio\n')
    lock = Lock()
    active_threads = set()
    for compact, sample in compact_hash.iteritems():
        sys.stderr.write('doing %s stat\n' % compact)
        for sample_name, tabs in sample.iteritems():
            original_fq = os.popen('ls %s/%s/%s/*' % (original_path, compact, sample_name)).read().strip().split('\n').pop(0)
            t = Thread(target=raw_stat_thread, args=(original_fq, lock, compact, sample_name, tabs, out))
            active_threads.add(t)
            t.start()
            while True:
                # throttle thread creation to the number of CPUs
                if threading.activeCount() < cpu_count():
                    break
    out.flush()
    for t in threading.enumerate():
        if t in active_threads:
            sys.stderr.write('thread %s is still alive, wait ...\n' % t.ident)
            t.join()
    sys.stderr.write('Unaligned stat ...\n')
    out.write('\n###\n')
    unalign_fq = os.popen('ls %s/Unalign/*' % original_path).read().strip().split('\n').pop(0)
    unalign_reads = fq_reads_num(unalign_fq)
    total_reads += unalign_reads
    ratio = unalign_reads / total_reads * 100
    out.write('Unalign\t%s\t%2.2f%%\n' % (unalign_reads, ratio))
    out.close()
    sys.stderr.write('merge stat is all finished!\n\n')


def pandaseq(pandaseq_soft, read1, read2, fa_out, f_primer, r_primer, log_out):
    # call pandaseq to assemble paired reads, trimming primers and filtering by length
    cmd = '%s -F -f %s -r %s -w %s -p %s -q %s -g %s -l 220 -L 500' % (pandaseq_soft, read1, read2, fa_out, f_primer, r_primer, log_out)
    os.system(cmd)


def sampleMerge(sample_list, data_type, file_path, outfile):
    # merge the high-quality reads of all samples of one data type into a single fasta
    outhandle = open(outfile, 'w')
    # sys.stderr.write('Begin merge into %s\n' % file_path)
    reads_num = {}
    f_template = '%s/%s/high_quality.fq'
    for sample in sample_list:
        f = f_template % (file_path, sample)
        sample = re.sub('[-_]', '.', sample)
        sample = '%s%s' % (data_type, sample)
        if not reads_num.has_key(sample):
            reads_num[sample] = 0
        for record in SeqIO.parse(open(f), 'fastq'):
            reads_num[sample] += 1
            outhandle.write('>%s_%s\n%s\n' % (sample, reads_num[sample], str(record.seq)))
    outhandle.close()
    sys.stderr.write('merge file: %s is finished\n' % outfile)


def get_lib_method(file):
    # infer the library-preparation method from the sam_barcode file name
    file = os.path.basename(file)
    if re.match('^sam_barcode.l$', file):
        lib_method = 'Self'
    elif re.match('^sam_barcode.s\d+$', file):
        lib_method = 'HXT'
    else:
        lib_method = None
    return lib_method

settings.py contains the primer sequences for the different library-preparation methods.
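The actual contents of settings.py are not shown in the post; below is only a sketch of the shape the scripts expect, inferred from how it is used: `primer` is indexed as primer[lib_method][data_type]['forward'/'reverse'] in proc(), with lib_method being 'Self' or 'HXT' (from get_lib_method) and data_type being 16S or ITS, and `pandaseq_soft` is the path to the pandaseq binary. The path and the primer sequences here are placeholders, not the real values.

# settings.py -- sketch only; real primer sequences and paths are not given in the post
pandaseq_soft = '/opt/bio/bin/pandaseq'   # hypothetical install path

# primer[lib_method][data_type]['forward' / 'reverse'], as indexed in proc()
primer = {
    'Self': {
        '16S': {'forward': 'NNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
        'ITS': {'forward': 'NNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
    },
    'HXT': {
        '16S': {'forward': 'NNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
        'ITS': {'forward': 'NNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
    },
}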

This program is also a showcase of what I have learned over the past few days.
