16S data QC pipeline. A single sequencing lane contains many projects, each with its own contract number, and one project may include 16S data, ITS data, or both. A complete pipeline takes all of the upstream demultiplexed data and organizes it into data that is ready for direct analysis. Originally this work was parallelized through SGE, i.e. parallelism at the job-submission level; here the parallelism is implemented at the program level with multiprocessing, so the pipeline can run without an SGE system.
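Before the full script, here is a minimal sketch of the pattern it relies on: a JoinableQueue of jobs, a pool of daemon worker processes pulling from it, and a jobs.join() in the parent that returns once every job has been marked done, after which the result queue is drained. The names handle and run below are placeholders for illustration only; they are not part of the pipeline itself.

import sys
from multiprocessing import Process, JoinableQueue, Queue, cpu_count

def handle(item):
    # placeholder for the real per-sample work (pandaseq + QC in the pipeline)
    return item * item

def worker(jobs, results):
    while True:
        item = jobs.get()           # block until a job is available
        try:
            results.put(handle(item))
        finally:
            jobs.task_done()        # always mark the job done, even on failure

def run(items, concurrency):
    jobs = JoinableQueue()
    results = Queue()
    for item in items:
        jobs.put(item)              # enqueue all work up front, like add_jobs() below
    for _ in range(concurrency):
        p = Process(target=worker, args=(jobs, results))
        p.daemon = True             # daemon workers die with the parent process
        p.start()
    jobs.join()                     # returns once every job has been task_done()'d
    while not results.empty():      # drain results, as main() does after join()
        sys.stderr.write('%s\n' % results.get_nowait())

if __name__ == '__main__':
    run(range(10), cpu_count())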

import os
import sys
import re
import time
import collections
import threading
from multiprocessing import Process, JoinableQueue, Queue, cpu_count
from threading import Thread

from settings import primer, pandaseq_soft
from programs import *

Result = collections.namedtuple("Result", "compact sample_name HQ_fq")


def parse_sam_barcode_file(sam_barcode_file):
    for line in open(sam_barcode_file):
        yield line.strip().split('\t')


def proc(compact, sample_name, work_path, lib_method, data_type):
    # process one sample: pandaseq assembly of read1/read2, then quality filtering
    split_path = '%s/Split' % work_path
    QC_path = '%s/QC' % work_path
    compact_path = '%s/%s' % (QC_path, compact)
    if not os.path.exists(compact_path):
        os.makedirs(compact_path)
    sample_path = '%s/%s' % (compact_path, sample_name)
    if not os.path.exists(sample_path):
        os.makedirs(sample_path)
    original_path = '%s/%s/%s' % (split_path, compact, sample_name)
    (read1, read2) = os.popen('ls %s/*' % original_path).read().strip().split('\n')
    pandaseq_fq = '%s/pandaseq.fq' % sample_path
    pandaseq_log = '%s/pandaseq.log' % sample_path
    pandaseq(pandaseq_soft, read1, read2, pandaseq_fq,
             primer[lib_method][data_type]['forward'],
             primer[lib_method][data_type]['reverse'], pandaseq_log)
    high_quality_fq = '%s/high_quality.fq' % sample_path
    high_quality_log = '%s/high_quality.stat' % sample_path
    QC(pandaseq_fq, high_quality_fq, high_quality_log, data_type)
    return Result(compact, sample_name, high_quality_fq)


def worker(work_path, jobs, results):
    # each worker process pulls jobs until the parent exits; failed jobs are re-queued
    while True:
        try:
            compact, sample_name, lib_method, data_type = jobs.get()
            try:
                result = proc(compact, sample_name, work_path, lib_method, data_type)
                sys.stderr.write('Process %s finished with compact:%s sample_name:%s\n' % (os.getpid(), compact, sample_name))
                results.put(result)
            except:
                sys.stderr.write('Process %s FAILED !!! %s/%s may have some problem!\n' % (os.getpid(), compact, sample_name))
                jobs.put((compact, sample_name, lib_method, data_type))
                sys.stderr.write('The job is re-pushed into the queue, with compact:%s sample_name:%s\n' % (compact, sample_name))
        finally:
            jobs.task_done()


def add_jobs(work_path, sam_barcode_file_list, jobs):
    # read every sam_barcode file and push one job per sample onto the queue
    job_num = 0
    data_type_hash = {}
    for todo, sam_barcode_file in enumerate(sam_barcode_file_list):
        sam_barcode_file = sam_barcode_file.strip()
        if not os.path.isfile(sam_barcode_file):
            continue
        lib_method = get_lib_method(sam_barcode_file)
        if lib_method is None:
            continue
        print 'sam_barcode_file loading: %s ...... ok\n' % sam_barcode_file
        for compact, sample_name, barcode_info, data_type in parse_sam_barcode_file(sam_barcode_file):
            if not data_type_hash.has_key(compact):
                data_type_hash[compact] = {}
            if not data_type_hash[compact].has_key(data_type):
                data_type_hash[compact][data_type] = []
            data_type_hash[compact][data_type].append(sample_name)
            jobs.put((compact, sample_name, lib_method, data_type))
            job_num += 1
            sys.stderr.write('The job is pushed into the queue, with compact:%s sample_name:%s\n' % (compact, sample_name))
    sys.stderr.write('\n### All %s jobs have been pushed into the queue ###\n' % job_num)
    return data_type_hash


def create_processes(concurrency, jobs, work_path, results):
    print '\nBegin creating %s worker Processes...\n' % concurrency
    for _ in range(concurrency):
        process = Process(target=worker, args=(work_path, jobs, results))
        process.daemon = True
        process.start()


def main(work_path, sam_barcode_file_list):
    global concurrency
    split_path = '%s/Split' % work_path
    QC_path = '%s/QC' % work_path
    jobs = JoinableQueue()
    results = Queue()
    canceled = False
    data_type_hash = add_jobs(split_path, sam_barcode_file_list, jobs)
    create_processes(concurrency, jobs, work_path, results)
    try:
        jobs.join()
    except KeyboardInterrupt:
        sys.stderr.write('cancelling ...\n')
        canceled = True
    finally:
        # drain the result queue and record which samples finished
        job_num = 0
        finished_hash = {}
        while not results.empty():
            result = results.get_nowait()
            job_num += 1
            if not finished_hash.has_key(result.compact):
                finished_hash[result.compact] = []
            finished_hash[result.compact].append(result.sample_name)
        sys.stderr.write('all %s work finished!\n\n' % job_num)
    log_out = open('%s/work.log' % QC_path, 'w')
    for compact, sample_list in finished_hash.iteritems():
        for sample_name in sample_list:
            log_out.write('%s\t%s has been finished\n' % (compact, sample_name))
    log_out.close()
    if canceled:
        return False
    # per-compact read statistics
    for compact in os.listdir(QC_path):
        compact_dir = '%s/%s' % (QC_path, compact)
        if not os.path.isdir(compact_dir):
            continue
        sys.stderr.write('Begin stat compact: %s\n' % compact)
        reads_stat(compact_dir)
    sys.stderr.write('All compact stat finished!\n\n')
    reads_stat_all(QC_path, split_path)
    # merge the high-quality reads of each compact/data_type in threads
    merge_threads = set()
    for compact, subitem in data_type_hash.iteritems():
        compact_dir = '%s/%s' % (QC_path, compact)
        for data_type, sample_list in subitem.iteritems():
            merged_file = '%s/%s/%s.together.fna' % (QC_path, compact, data_type)
            t = Thread(target=sampleMerge, args=(sample_list, data_type, compact_dir, merged_file))
            merge_threads.add(t)
            t.start()
            while True:
                # throttle: do not keep more threads alive than the concurrency limit
                if threading.activeCount() < concurrency:
                    break
    for t in threading.enumerate():
        if t in merge_threads:
            t.join()
    sys.stderr.write('\nAll of the pipeline is done!\n')


if __name__ == '__main__':
    sys.argv.pop(0)
    if len(sys.argv) < 1:
        sys.stderr.write('Usage: python run_pipeline.py work_path [process_num]\n  process_num defaults to cpu_count\n')
        sys.exit()
    work_path = os.path.abspath(sys.argv.pop(0))
    sys.stderr.write('Workdir is %s, pipeline begins\n' % work_path)
    sam_barcode_file_list = os.popen('ls %s/Split/sam_barcode.*' % work_path).read().strip().split('\n')
    if len(sys.argv) != 0:
        concurrency = int(sys.argv.pop(0))
    else:
        concurrency = cpu_count()
    main(work_path, sam_barcode_file_list)

Below are the helper routines (the `programs` module imported above):

from __future__ import division
from threading import Thread, Lock
from multiprocessing import cpu_count
import threading
import sys
import os
import re
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord


def fq_reads_num(fq_file):
    # a fastq record is 4 lines, so reads = line count / 4
    wc_out = os.popen('wc -l %s' % fq_file).read().strip()
    return int(int(re.search('^(\d+)', wc_out).group(1)) / 4)


def Q_ave(self):
    # average phred quality of a SeqRecord; monkey-patched onto SeqRecord in QC()
    Q_sum = 0
    for qlist in self.letter_annotations.itervalues():
        for q in qlist:
            Q_sum += q
    return Q_sum / len(self)


def QC(file, out_file, out_stat_file, data_type):
    # keep reads with average quality >= 20 and length within [220, 500]
    SeqRecord.Q_ave = Q_ave
    out_stat = open(out_stat_file, 'w')
    out = open(out_file, 'w')
    count = 0
    high_count = 0
    for record in SeqIO.parse(open(file), 'fastq'):
        count += 1
        if record.Q_ave() < 20:
            continue
        if len(record) < 220 or len(record) > 500:
            continue
        out.write(record.format('fastq'))
        high_count += 1
    high_ratio = high_count / count
    out_stat.write('%s\t%s\t%s\t%s\n' % (data_type, count, high_count, high_ratio))


class MyList(list):
    # a list that prints itself as tab-separated fields
    def __str__(self):
        out_str = ''
        for item in self:
            out_str += item
            out_str += '\t'
        return out_str.strip()


def parse_stat(stat_file):
    tabs = os.popen('cat %s' % stat_file).read().strip().split('\t')
    yield tabs


def parse_stat_files(compact_path):
    for f in os.popen('ls %s/*/*.stat' % compact_path):
        stat_file = f.strip()
        sample_name = re.search('%s\/(\S+)\/high_quality\.stat' % compact_path, stat_file).group(1)
        yield stat_file, sample_name


def reads_stat(compact_path):
    # per-compact table of raw / high-quality read counts
    out = open('%s/reads_stat.xls' % compact_path, 'w')
    sample_reads = {}
    for stat_file, sample_name in parse_stat_files(compact_path):
        for tabs in parse_stat(stat_file):
            sample_reads[sample_name] = tabs
    out.write('sample_name\tsample_type\traw_reads\tHQ_reads\tHQ_ratio\n')
    for sample, tabs in sample_reads.iteritems():
        tabs = MyList(tabs)
        out.write('%s\t%s\n' % (sample, str(tabs)))
    out.close()


def raw_stat_thread(fq_file, lock, compact, sample_name, tabs, out):
    global total_reads
    # sys.stderr.write('thread %s stat with %s %s\n' % (threading.currentThread().ident, compact, sample_name))
    raw_reads = fq_reads_num(fq_file)
    lock.acquire()
    total_reads += raw_reads
    data_type = tabs.pop(0)
    ratio = int(tabs[1]) / raw_reads * 100
    tabs = str(MyList(tabs))
    out.write('%s\t%s\t%s\t%s\t%s\t%2.2f%%\n' % (compact, sample_name, data_type, raw_reads, tabs, ratio))
    lock.release()
    # sys.stderr.write('thread %s finished doing with %s %s\n' % (threading.currentThread().ident, compact, sample_name))


total_reads = 0


def reads_stat_all(work_path, original_path):
    # lane-wide table, one thread per sample for the raw read counting
    global total_reads
    sys.stderr.write('\nmerge stat is beginning ... \n')
    out = open('%s/reads_stat.xls' % work_path, 'w')
    compact_hash = {}
    for f in os.listdir(work_path):
        compact = f.strip()
        compact_path = '%s/%s' % (work_path, compact)
        if not os.path.isdir(compact_path):
            continue
        if not compact_hash.has_key(compact):
            compact_hash[compact] = {}
        for stat_file, sample_name in parse_stat_files(compact_path):
            for tabs in parse_stat(stat_file):
                compact_hash[compact][sample_name] = tabs
    out.write('compact\tsample_name\tdata_type\traw_reads\tpandaseq_reads\tHQ_reads\tratio\n')
    lock = Lock()
    active_threads = set()
    for compact, sample in compact_hash.iteritems():
        sys.stderr.write('doing %s stat\n' % compact)
        for sample_name, tabs in sample.iteritems():
            original_fq = os.popen('ls %s/%s/%s/*' % (original_path, compact, sample_name)).read().strip().split('\n').pop(0)
            t = Thread(target=raw_stat_thread, args=(original_fq, lock, compact, sample_name, tabs, out))
            active_threads.add(t)
            t.start()
            while True:
                # throttle thread creation to the number of CPUs
                if threading.activeCount() < cpu_count():
                    break
    out.flush()
    for t in threading.enumerate():
        if t in active_threads:
            sys.stderr.write('thread %s is still alive, wait ...\n' % t.ident)
            t.join()
    sys.stderr.write('Unaligned stat ...\n')
    out.write('\n###\n')
    unalign_fq = os.popen('ls %s/Unalign/*' % original_path).read().strip().split('\n').pop(0)
    unalign_reads = fq_reads_num(unalign_fq)
    total_reads += unalign_reads
    ratio = unalign_reads / total_reads * 100
    out.write('Unalign\t%s\t%2.2f%%\n' % (unalign_reads, ratio))
    out.close()
    sys.stderr.write('merge stat is all finished!\n\n')


def pandaseq(pandaseq_soft, read1, read2, fa_out, f_primer, r_primer, log_out):
    # call pandaseq to assemble paired reads, trimming primers and filtering by length
    cmd = '%s -F -f %s -r %s -w %s -p %s -q %s -g %s -l 220 -L 500' % (pandaseq_soft, read1, read2, fa_out, f_primer, r_primer, log_out)
    os.system(cmd)


def sampleMerge(sample_list, data_type, file_path, outfile):
    # merge the high-quality reads of all samples of one data type into a single fasta
    outhandle = open(outfile, 'w')
    # sys.stderr.write('Begin merge into %s\n' % file_path)
    reads_num = {}
    f_template = '%s/%s/high_quality.fq'
    for sample in sample_list:
        f = f_template % (file_path, sample)
        sample = re.sub('[-_]', '.', sample)
        sample = '%s%s' % (data_type, sample)
        if not reads_num.has_key(sample):
            reads_num[sample] = 0
        for record in SeqIO.parse(open(f), 'fastq'):
            reads_num[sample] += 1
            outhandle.write('>%s_%s\n%s\n' % (sample, reads_num[sample], str(record.seq)))
    outhandle.close()
    sys.stderr.write('merge file: %s is finished\n' % outfile)


def get_lib_method(file):
    # infer the library-preparation method from the sam_barcode file name
    file = os.path.basename(file)
    if re.match('^sam_barcode.l$', file):
        lib_method = 'Self'
    elif re.match('^sam_barcode.s\d+$', file):
        lib_method = 'HXT'
    else:
        lib_method = None
    return lib_method

settings.py contains the primer sequences for the different library-preparation methods.
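The actual contents of settings.py are not shown in the post; below is only a sketch of the shape the scripts expect, inferred from how it is used: `primer` is indexed as primer[lib_method][data_type]['forward'/'reverse'] in proc(), with lib_method being 'Self' or 'HXT' (from get_lib_method) and data_type being 16S or ITS, and `pandaseq_soft` is the path to the pandaseq binary. The path and the primer sequences here are placeholders, not the real values.

# settings.py -- sketch only; real primer sequences and paths are not given in the post
pandaseq_soft = '/opt/bio/bin/pandaseq'   # hypothetical install path

# primer[lib_method][data_type]['forward' / 'reverse'], as indexed in proc()
primer = {
    'Self': {
        '16S': {'forward': 'NNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
        'ITS': {'forward': 'NNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
    },
    'HXT': {
        '16S': {'forward': 'NNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
        'ITS': {'forward': 'NNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
    },
}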

This program is also a showcase of what I have learned over the past few days.
