Python Concurrency in Practice 03: Concurrency in Action, Part 1
A QC pipeline for 16S data. One sequencing lane contains many projects, each with its own contract number, and a single project may include 16S data, ITS data, or both. This pipeline takes the demultiplexed upstream data and turns all of it into files that are ready for downstream analysis. The work used to be parallelised by submitting jobs to SGE, i.e. parallelism at the scheduler level; here the parallelism is implemented inside the program itself, so the pipeline no longer depends on an SGE cluster.
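The script follows the classic multiprocessing producer/consumer layout: the main process pushes one job per sample onto a JoinableQueue, a pool of daemon worker processes consumes the queue, and finished results come back through an ordinary Queue. Stripped of the pipeline details, the pattern is roughly the minimal sketch below (do_work and the range(20) job list are stand-ins, not part of the pipeline):

import sys
from multiprocessing import Process, JoinableQueue, Queue, cpu_count

def do_work(job):
    # stand-in for the real per-sample work (pandaseq assembly + quality filtering)
    return job * job

def worker(jobs, results):
    while True:
        job = jobs.get()              # blocks until a job is available
        try:
            results.put(do_work(job))
        finally:
            jobs.task_done()          # always acknowledge, so jobs.join() can return

if __name__ == '__main__':
    jobs, results = JoinableQueue(), Queue()
    for job in range(20):             # stand-in for add_jobs()
        jobs.put(job)
    for _ in range(cpu_count()):
        p = Process(target=worker, args=(jobs, results))
        p.daemon = True               # worker processes exit with the main process
        p.start()
    jobs.join()                       # returns once every job has been task_done()
    while not results.empty():
        sys.stderr.write('%s\n' % results.get_nowait())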
import os
import sys
import re
import time
import collections
import threading
from multiprocessing import Process, JoinableQueue, Queue, cpu_count
from threading import Thread

from settings import primer, pandaseq_soft
from programs import *

# A finished job is handed back to the main process as a named tuple.
Result = collections.namedtuple("Result", "compact sample_name HQ_fq")

def parse_sam_barcode_file(sam_barcode_file):
    # Each line of a sam_barcode file holds: compact, sample_name, barcode_info, data_type.
    for line in open(sam_barcode_file):
        yield line.strip().split('\t')
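Each sam_barcode file is a tab-separated table with four columns per line: contract number (compact), sample name, barcode information, and data type. A purely hypothetical line, just to illustrate the format, might look like:

HT2016001	sampleA	ACGTACGTAA	16S

The data_type column later selects the primer pair from settings.py, and the first two columns determine the Split/<compact>/<sample_name>/ input directory.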
def proc(compact, sample_name, work_path, lib_method, data_type):
    # Assemble and quality-filter one sample: Split/<compact>/<sample> -> QC/<compact>/<sample>.
    split_path = '%s/Split' % work_path
    QC_path = '%s/QC' % work_path
    compact_path = '%s/%s' % (QC_path, compact)
    if not os.path.exists(compact_path):
        os.makedirs(compact_path)
    sample_path = '%s/%s' % (compact_path, sample_name)
    if not os.path.exists(sample_path):
        os.makedirs(sample_path)
    original_path = '%s/%s/%s' % (split_path, compact, sample_name)
    (read1, read2) = os.popen('ls %s/*' % original_path).read().strip().split('\n')
    pandaseq_fq = '%s/pandaseq.fq' % sample_path
    pandaseq_log = '%s/pandaseq.log' % sample_path
    pandaseq(pandaseq_soft, read1, read2, pandaseq_fq,
             primer[lib_method][data_type]['forward'],
             primer[lib_method][data_type]['reverse'], pandaseq_log)
    high_quality_fq = '%s/high_quality.fq' % sample_path
    high_quality_log = '%s/high_quality.stat' % sample_path
    QC(pandaseq_fq, high_quality_fq, high_quality_log, data_type)
    return Result(compact, sample_name, high_quality_fq)

def worker(work_path, jobs, results):
    # Each worker process loops forever; it is daemonised, so it dies with the main process.
    while True:
        try:
            compact, sample_name, lib_method, data_type = jobs.get()
            try:
                result = proc(compact, sample_name, work_path, lib_method, data_type)
                sys.stderr.write('Process %s finished compact:%s sample_name:%s\n' % (os.getpid(), compact, sample_name))
                results.put(result)
            except:
                sys.stderr.write('Process %s FAILED !!! %s/%s may have a problem!\n' % (os.getpid(), compact, sample_name))
                jobs.put((compact, sample_name, lib_method, data_type))
                sys.stderr.write('The job was pushed back onto the queue, compact:%s sample_name:%s\n' % (compact, sample_name))
        finally:
            jobs.task_done()
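One caveat with this worker: a failed job is pushed straight back onto the queue, so a sample that fails deterministically (for example, a missing read file) will be retried forever and jobs.join() will never return. A hedged sketch of one way around that, not part of the original code, is to carry a retry counter as a fifth field of the job tuple (MAX_RETRIES is an arbitrary choice here, and add_jobs would have to queue 5-tuples with tries=0):

MAX_RETRIES = 3  # arbitrary cap, not part of the original pipeline

def worker_with_retry(work_path, jobs, results):
    while True:
        compact, sample_name, lib_method, data_type, tries = jobs.get()
        try:
            try:
                results.put(proc(compact, sample_name, work_path, lib_method, data_type))
            except Exception:
                if tries + 1 < MAX_RETRIES:
                    # re-queue with an incremented retry count
                    jobs.put((compact, sample_name, lib_method, data_type, tries + 1))
                else:
                    sys.stderr.write('%s/%s gave up after %s tries\n' % (compact, sample_name, MAX_RETRIES))
        finally:
            jobs.task_done()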
def add_jobs(work_path, sam_barcode_file_list, jobs):
    # Read every sam_barcode file, queue one job per sample, and remember which
    # samples belong to each contract/data type for the merge step at the end.
    job_num = 0
    data_type_hash = {}
    for todo, sam_barcode_file in enumerate(sam_barcode_file_list):
        sam_barcode_file = sam_barcode_file.strip()
        if not os.path.isfile(sam_barcode_file):
            continue
        lib_method = get_lib_method(sam_barcode_file)
        if lib_method is None:
            continue
        print 'sam_barcode_file loading: %s ...... ok\n' % sam_barcode_file
        for compact, sample_name, barcode_info, data_type in parse_sam_barcode_file(sam_barcode_file):
            if not data_type_hash.has_key(compact):
                data_type_hash[compact] = {}
            if not data_type_hash[compact].has_key(data_type):
                data_type_hash[compact][data_type] = []
            data_type_hash[compact][data_type].append(sample_name)
            jobs.put((compact, sample_name, lib_method, data_type))
            job_num += 1
            sys.stderr.write('Job pushed onto the queue, compact:%s sample_name:%s\n' % (compact, sample_name))
    sys.stderr.write('\n### All %s jobs have been pushed onto the queue ###\n' % job_num)
    return data_type_hash

def create_processes(concurrency, jobs, work_path, results):
    print '\nBegin creating %s worker processes...\n' % concurrency
    for _ in range(concurrency):
        process = Process(target=worker, args=(work_path, jobs, results))
        process.daemon = True   # daemon workers are killed automatically when the main process exits
        process.start()

def main(work_path, sam_barcode_file_list):
    global concurrency
    split_path = '%s/Split' % work_path
    QC_path = '%s/QC' % work_path
    jobs = JoinableQueue()
    results = Queue()
    canceled = False
    data_type_hash = add_jobs(split_path, sam_barcode_file_list, jobs)
    create_processes(concurrency, jobs, work_path, results)
    try:
        jobs.join()                      # block until every queued job reports task_done()
    except KeyboardInterrupt:
        sys.stderr.write('cancelling ...\n')
        canceled = True
    finally:
        # Drain the result queue and log which samples finished.
        job_num = 0
        finished_hash = {}
        while not results.empty():
            result = results.get_nowait()
            job_num += 1
            if not finished_hash.has_key(result.compact):
                finished_hash[result.compact] = []
            finished_hash[result.compact].append(result.sample_name)
        sys.stderr.write('all %s jobs finished!\n\n' % job_num)
        log_out = open('%s/work.log' % QC_path, 'w')
        for compact, sample_list in finished_hash.iteritems():
            for sample_name in sample_list:
                log_out.write('%s\t%s has been finished\n' % (compact, sample_name))
        log_out.close()
    if canceled:
        return False

    # Per-contract read statistics.
    for compact in os.listdir(QC_path):
        compact_dir = '%s/%s' % (QC_path, compact)
        if not os.path.isdir(compact_dir):
            continue
        sys.stderr.write('Begin stat of compact: %s\n' % compact)
        reads_stat(compact_dir)
    sys.stderr.write('All compact stats finished!\n\n')

    reads_stat_all(QC_path, split_path)

    # Merge each contract's high-quality reads per data type, one thread per merged file.
    merge_threads = set()
    for compact, subitem in data_type_hash.iteritems():
        compact_dir = '%s/%s' % (QC_path, compact)
        for data_type, sample_list in subitem.iteritems():
            merged_file = '%s/%s/%s.together.fna' % (QC_path, compact, data_type)
            t = Thread(target=sampleMerge, args=(sample_list, data_type, compact_dir, merged_file))
            merge_threads.add(t)
            t.start()
            # Crude throttle: spin until fewer than `concurrency` threads are alive.
            while True:
                if threading.activeCount() < concurrency:
                    break
    for t in threading.enumerate():
        if t in merge_threads:
            t.join()
    sys.stderr.write('\nAll pipeline steps are done!\n')

if __name__ == '__main__':
    sys.argv.pop(0)
    if len(sys.argv) < 1:
        sys.stderr.write('Usage: python run_pipeline.py work_path [process_num]\n    process_num default is cpu_count\n')
        sys.exit()
    work_path = sys.argv.pop(0)
    work_path = os.path.abspath(work_path)
    sys.stderr.write('Workdir is %s, pipeline begin\n' % work_path)
    sam_barcode_file_list = os.popen('ls %s/Split/sam_barcode.*' % work_path).read().strip().split('\n')
    if len(sys.argv) != 0:
        concurrency = int(sys.argv.pop(0))
    else:
        concurrency = cpu_count()
    main(work_path, sam_barcode_file_list)
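Usage, as the usage string above states: `python run_pipeline.py work_path [process_num]`, where work_path must already contain the Split/ directory produced by demultiplexing (the per-sample reads and the sam_barcode.* description files). If process_num is omitted, the script falls back to cpu_count(). For example, `python run_pipeline.py /data/run01 16` (the path here is only illustrative) would process the lane with 16 worker processes.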
Below are the helper routines (the programs module imported by the main script):
from __future__ import division
from threading import Thread,Lock
from multiprocessing import cpu_count
import threading
import sys
import os
import re
import types
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
def fq_reads_num(fq_file):
    # A FASTQ record is four lines, so read count = line count / 4.
    wc_out = os.popen('wc -l %s' % fq_file).read().strip()
    result = int(re.search('^(\d+)', wc_out).group(1)) / 4
    return int(result)

def Q_ave(self):
    # Mean Phred quality of the whole record; attached to SeqRecord inside QC().
    Q_sum = 0
    for qlist in self.letter_annotations.itervalues():
        for q in qlist:
            Q_sum += q
    Q_ave = Q_sum / len(self)
    return Q_ave

def QC(file, out_file, out_stat_file, data_type):
    # Keep only assembled reads with mean quality >= Q20 and length 220-500 bp.
    SeqRecord.Q_ave = Q_ave
    out_stat = open(out_stat_file, 'w')
    out = open(out_file, 'w')
    count = 0
    high_count = 0
    for record in SeqIO.parse(open(file), 'fastq'):
        count += 1
        if record.Q_ave() < 20:
            continue
        if len(record) < 220 or len(record) > 500:
            continue
        out.write(record.format('fastq'))
        high_count += 1
    high_ratio = high_count / count if count else 0
    out_stat.write('%s\t%s\t%s\t%s\n' % (data_type, count, high_count, high_ratio))
    out.close()
    out_stat.close()
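A note on Q_ave: for records parsed with SeqIO.parse(..., 'fastq'), letter_annotations holds only the per-base Phred scores under the key 'phred_quality', so iterating over itervalues() amounts to averaging the Phred scores. An equivalent, more explicit version (a sketch, not from the original module) would be:

def Q_ave_explicit(record):
    # assumes the record came from a FASTQ parse, so 'phred_quality' is present
    quals = record.letter_annotations['phred_quality']
    return float(sum(quals)) / len(quals)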
class MyList(list):
    def __str__(self):
        out_str = ''
        for item in self:
            out_str += item
            out_str += '\t'
        return out_str.strip()

def parse_stat(stat_file):
    tabs = os.popen('cat %s' % stat_file).read().strip().split('\t')
    yield tabs

def parse_stat_files(compact_path):
    # Yield every per-sample high_quality.stat file under a contract directory.
    for f in os.popen('ls %s/*/*.stat' % compact_path):
        stat_file = f.strip()
        sample_name = re.search('%s\/(\S+)\/high_quality\.stat' % compact_path, stat_file).group(1)
        yield stat_file, sample_name

def reads_stat(compact_path):
    out = open('%s/reads_stat.xls' % compact_path, 'w')
    sample_reads = {}
    for stat_file, sample_name in parse_stat_files(compact_path):
        for tabs in parse_stat(stat_file):
            sample_reads[sample_name] = tabs
    out.write('sample_name\tsample_type\traw_reads\tHQ_reads\tHQ_ratio\n')
    for sample, tabs in sample_reads.iteritems():
        tabs = MyList(tabs)
        out.write('%s\t%s\n' % (sample, str(tabs)))
    out.close()

def raw_stat_thread(fq_file, lock, compact, sample_name, tabs, out):
    global total_reads
    raw_reads = fq_reads_num(fq_file)
    # The lock serialises both the update of the shared total and the write to the
    # shared output handle, since many stat threads run at once.
    lock.acquire()
    total_reads += raw_reads
    data_type = tabs.pop(0)
    ratio = int(tabs[1]) / raw_reads * 100
    tabs = str(MyList(tabs))
    out.write('%s\t%s\t%s\t%s\t%s\t%2.2f%%\n' % (compact, sample_name, data_type, raw_reads, tabs, ratio))
    lock.release()

# shared across stat threads; guarded by the lock passed to raw_stat_thread
total_reads = 0
def reads_stat_all(work_path, original_path):
    global total_reads
    sys.stderr.write('\nmerge stat begins ... \n')
    out = open('%s/reads_stat.xls' % work_path, 'w')
    compact_hash = {}
    for f in os.listdir(work_path):
        compact = f.strip()
        compact_path = '%s/%s' % (work_path, compact)
        if not os.path.isdir(compact_path):
            continue
        if not compact_hash.has_key(compact):
            compact_hash[compact] = {}
        for stat_file, sample_name in parse_stat_files(compact_path):
            for tabs in parse_stat(stat_file):
                compact_hash[compact][sample_name] = tabs
    # Each row: compact, sample, data_type, raw reads, the three .stat fields, then HQ/raw %.
    out.write('compact\tsample_name\tdata_type\traw_reads\tpandaseq_reads\tHQ_reads\tratio\n')
    lock = Lock()
    active_threads = set()
    for compact, sample in compact_hash.iteritems():
        sys.stderr.write('doing %s stat\n' % compact)
        for sample_name, tabs in sample.iteritems():
            original_fq = os.popen('ls %s/%s/%s/*' % (original_path, compact, sample_name)).read().strip().split('\n').pop(0)
            t = Thread(target=raw_stat_thread, args=(original_fq, lock, compact, sample_name, tabs, out))
            active_threads.add(t)
            t.start()
            # Crude throttle: spin until fewer threads than CPU cores are alive.
            while True:
                if threading.activeCount() < cpu_count():
                    break
    out.flush()
    for t in threading.enumerate():
        if t in active_threads:
            sys.stderr.write('thread %s is still alive, wait ...\n' % t.ident)
            t.join()
    sys.stderr.write('Unaligned stat ...\n')
    out.write('\n###\n')
    unalign_fq = os.popen('ls %s/Unalign/*' % original_path).read().strip().split('\n').pop(0)
    unalign_reads = fq_reads_num(unalign_fq)
    total_reads += unalign_reads
    ratio = unalign_reads / total_reads * 100
    out.write('Unalign\t%s\t%2.2f%%\n' % (unalign_reads, ratio))
    out.close()
    sys.stderr.write('merge stat is all finished!\n\n')
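The `while True: ... threading.activeCount()` loops above (and the similar one in main) throttle thread creation by spinning until a slot frees up, which burns a CPU core while waiting. A gentler alternative, sketched below under the assumption that the per-sample work stays unchanged, is a threading.Semaphore sized to the core count (the names slots and throttled_stat are hypothetical, not from the original code):

import threading
from multiprocessing import cpu_count

slots = threading.Semaphore(cpu_count())   # at most cpu_count() stat jobs run at once

def throttled_stat(fq_file, lock, compact, sample_name, tabs, out):
    # acquire() blocks quietly instead of spinning; released when the block exits
    with slots:
        raw_stat_thread(fq_file, lock, compact, sample_name, tabs, out)

# All threads can then be started up front, e.g.:
#   t = Thread(target=throttled_stat, args=(original_fq, lock, compact, sample_name, tabs, out))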
def pandaseq(pandaseq_soft, read1, read2, fa_out, f_primer, r_primer, log_out):
    # Assemble the read pair with PANDAseq; the forward/reverse primers and a
    # 220-500 bp length window are passed on the command line, output goes to fa_out.
    cmd = '%s -F -f %s -r %s -w %s -p %s -q %s -g %s -l 220 -L 500' % (pandaseq_soft, read1, read2, fa_out, f_primer, r_primer, log_out)
    os.system(cmd)

def sampleMerge(sample_list, data_type, file_path, outfile):
    # Concatenate every sample's high-quality reads into one FASTA file,
    # renaming each read to '<data_type><sample>_<running number>'.
    outhandle = open(outfile, 'w')
    reads_num = {}
    f_template = '%s/%s/high_quality.fq'
    for sample in sample_list:
        f = f_template % (file_path, sample)
        sample = re.sub('[-_]', '.', sample)
        sample = '%s%s' % (data_type, sample)
        if not reads_num.has_key(sample):
            reads_num[sample] = 0
        for record in SeqIO.parse(open(f), 'fastq'):
            reads_num[sample] += 1
            outhandle.write('>%s_%s\n%s\n' % (sample, reads_num[sample], str(record.seq)))
    outhandle.close()
    sys.stderr.write('merge file: %s is finished\n' % outfile)

def get_lib_method(file):
    # The sam_barcode file name encodes the library construction method:
    # 'sam_barcode.l' means a self-built library, 'sam_barcode.s<n>' means HXT.
    file = os.path.basename(file)
    if re.match('^sam_barcode.l$', file):
        lib_method = 'Self'
    elif re.match('^sam_barcode.s\d+$', file):
        lib_method = 'HXT'
    else:
        lib_method = None
    return lib_method
settings.py holds the primer sequences for each library construction method, as well as the path to the PANDAseq binary (pandaseq_soft).
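The original settings.py is not shown. Based on how primer and pandaseq_soft are used above (primer[lib_method][data_type]['forward'] and ['reverse']), it presumably looks something like the sketch below; the path and primer strings here are placeholders, not the real values, and the data-type keys are simply whatever appears in the data_type column of the sam_barcode files:

# settings.py (sketch -- placeholder values only)
pandaseq_soft = '/path/to/pandaseq'

primer = {
    'Self': {
        '16S': {'forward': 'FORWARD_PRIMER_SEQ', 'reverse': 'REVERSE_PRIMER_SEQ'},
        'ITS': {'forward': 'FORWARD_PRIMER_SEQ', 'reverse': 'REVERSE_PRIMER_SEQ'},
    },
    'HXT': {
        '16S': {'forward': 'FORWARD_PRIMER_SEQ', 'reverse': 'REVERSE_PRIMER_SEQ'},
        'ITS': {'forward': 'FORWARD_PRIMER_SEQ', 'reverse': 'REVERSE_PRIMER_SEQ'},
    },
}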
This program also doubles as a showcase of what I have learned over the past few days.