import xlrd import os import xlwt from Bio import pairwise2 from docx import Document from docx.shared import RGBColor global COLOR_WHEEL global PREFIX_ATG global PREFIX_NOT_ATG global SUFFIX def main(): print 'extracting information' #extract information from files on expected and ovserved sequences expected_spreadsheet_path = \ input('Path to spreadsheet with expeted sequences: ') observed_folder_path = \ input('Path to folder with text files containing observed sequences: ') book = xlrd.open_workbook(expected_spreadsheet_path) sheet = book.sheet_by_index(0) numrows = sheet.nrows book_name = expected_spreadsheet_path.split('/')[-1].split('.')[0] #Expected_Sequences holds rows from input spreadsheet Expected_Sequences = [] for row in range(1,numrows): row_array = sheet.row_values(row) Expected_Sequences.append(Expect_Sequence(row_array)) #Observed_Sequences associates the contents of files with observed #sequences with the name of the file Observed_Sequences = {} for observed_file in os.listdir(observed_folder_path): observed_file_path = observed_folder_path + '/' + observed_file f = open(observed_file_path, 'r') observed_sequence = ''.join(f.read().split('\n')) Observed_Sequences[observed_file] = observed_sequence.upper() f.close() print 'comparing values' output_path = observed_folder_path.split('/')[-1] + ' Results' if not os.path.exists(output_path): os.makedirs(output_path) book = xlwt.Workbook() sh = book.add_sheet('Results') row_index = 0 #For each expected sequence, compares with observed sequence for Expected_Sequence in Expected_Sequences: try: #initialize document document = Document() document.add_heading(Expected_Sequence.Name, level = 0) document.add_heading('Sequence File:', level = 1) document.add_paragraph(Expected_Sequence.Observed_File) document.add_heading('Backbone:', level = 1) document.add_paragraph(Expected_Sequence.Backbone) document.add_heading('Inserts:', level = 1) sh.write(row_index, 0, Expected_Sequence.Name) i = 1 for insert in Expected_Sequence.Expected_Inserts: document.add_paragraph('Insert ' + str(i) + ':') document.add_paragraph(insert.Name) p = document.add_paragraph('') run = p.add_run(insert.Sequence) font = run.font font.color.rgb = insert.Color i += 1 document.add_heading('Observed Sequence', level = 1) paragraph = document.add_paragraph('') print ('comparing ' + Expected_Sequence.Name) Observed_Sequence = Observed_Sequences[Expected_Sequence.Observed_File] Sequence_Perfect = False Prefix_Perfect = True Suffix_Perfect = True Insert_Perfect = True Insert_Present = False Prefix_Present = False Suffix_Present = False Prefix_Adjacent = False Suffix_Adjacent = False Short_Sequence = False Bad_Sequence = (set(Observed_Sequence) == set(['N'])) Sequence_Check = '' pref_point_mut = 0 pref_ins_mut = 0 pref_del_mut = 0 ins_point_mut = 0 ins_ins_mut = 0 ins_del_mut = 0 ins_sec_miss = False suf_point_mut = 0 suf_ins_mut = 0 suf_del_mut = 0 base_index = 0 #reverses sequence if reversed if Expected_Sequence.Direction[0] == "R": Observed_Sequence = Rev_Comp(Observed_Sequence) #searches for prefix before expected sequence print 'finding prefix' if Expected_Sequence.Expected_Inserts[0].Sequence[:3] == "ATG": Prefix_Alignment = pairwise2.align.localms(\ PREFIX_ATG, Observed_Sequence, 1, -1, -3, -1) if Prefix_Alignment: Prefix_Alignment = Prefix_Alignment[0] if Prefix_Alignment[2] < 18: Prefix_Alignment = None else: Prefix_Alignment = pairwise2.align.localms(\ PREFIX_NOT_ATG, Observed_Sequence, 1, -1, -3, -1) if Prefix_Alignment: Prefix_Alignment = Prefix_Alignment[0] if Prefix_Alignment[2] < 18: Prefix_Alignment = None Last_Prefix_Base = None Last_Insert_Base = None if Prefix_Alignment: Prefix_Present = True Last_Prefix_Base = Prefix_Alignment[4] Before_Prefix = Observed_Sequence[:Prefix_Alignment[3]] paragraph.add_run(Before_Prefix) Sequence_Check = Sequence_Check + Before_Prefix base_index = Prefix_Alignment[3] for base in Prefix_Alignment[0][\ Prefix_Alignment[3]:Prefix_Alignment[4]]: if base_index < len(Observed_Sequence): if base == Prefix_Alignment[1][base_index]: run = paragraph.add_run(base) font = run.font font.color.rgb = RGBColor(0,255,0) Sequence_Check = Sequence_Check + base else: if Prefix_Alignment[1][base_index] != '-': run = paragraph.add_run(Prefix_Alignment[1][base_index]) font = run.font if Prefix_Alignment[0][base_index] == '-': font.color.rgb = RGBColor(222,0,20) pref_ins_mut += 1 else: pref_point_mut += 1 Sequence_Check = \ Sequence_Check + Prefix_Alignment[1][base_index] else: paragraph.add_run('_') pref_del_mut += 1 base_index += 1 base_index -= (len(Prefix_Alignment[1]) - len(Observed_Sequence)) print 'aligning inserts' #searches individually for parts of expected sequence #in order given in input spreadsheet for Expect_Insert in Expected_Sequence.Expected_Inserts: Insert = Expect_Insert.Sequence if len(Observed_Sequence) < len(Insert): Short_Sequence = True add_to_last_frag = '' for y in range(0,len(Insert)//80+1): if y == len(Insert)//80: if len(Insert) % 80 < 30: fragment = Insert[y*80:y*80 + 61] add_to_last_frag = Insert[y*80 + 61:(y+1)*80] else: fragment = Insert[y*80:(y+1)*80] if y == len(Insert)//80 + 1: fragment = add_to_last_frag + Insert[y*80:] else: fragment = Insert[y*80:(y+1)*80] frag_alignment = pairwise2.align.localms(\ fragment, Observed_Sequence, 1, -1, -3, -1) if frag_alignment: frag_alignment = frag_alignment[0] if frag_alignment[2] < (len(fragment) * .6) and \ frag_alignment[2] < 40: frag_alignment = None if frag_alignment: Insert_Present = True if y == 0 and Last_Prefix_Base: if ( frag_alignment[3] - Last_Prefix_Base <= 5 ): Prefix_Adjacent = True if y == len(Insert)//80: Last_Insert_Base = frag_alignment[4] if frag_alignment[3] >= base_index: Before_Insert = Observed_Sequence[base_index:frag_alignment[3]] paragraph.add_run(Before_Insert) Sequence_Check = Sequence_Check + Before_Insert base_index = frag_alignment[3] for base in frag_alignment[0][frag_alignment[3]:frag_alignment[4]]: if base_index < len(Observed_Sequence): if base == frag_alignment[1][base_index]: run = paragraph.add_run(base) font = run.font font.color.rgb = Expect_Insert.Color Sequence_Check = \ Sequence_Check + frag_alignment[1][base_index] else: if frag_alignment[1][base_index] != '-': run = paragraph.add_run(frag_alignment[1][base_index]) font = run.font if frag_alignment[0][base_index] == '-': font.color.rgb = RGBColor(222,0,20) ins_ins_mut += 1 else: ins_point_mut += 1 Sequence_Check = \ Sequence_Check + frag_alignment[1][base_index] else: paragraph.add_run('_') pref_del_mut += 1 base_index += 1 base_index -= (len(frag_alignment[1]) - len(Observed_Sequence)) elif frag_alignment[3] < base_index < frag_alignment[4]: for base in frag_alignment[0][base_index:frag_alignment[4]]: if base_index < len(Observed_Sequence): if base == frag_alignment[1][base_index]: run = paragraph.add_run(base) font = run.font font.color.rgb = Expect_Insert.Color Sequence_Check = \ Sequence_Check + frag_alignment[1][base_index] else: if frag_alignment[1][base_index] != '-': run = paragraph.add_run(frag_alignment[1][base_index]) font = run.font if frag_alignment[0][base_index] == '-': font.color.rgb = RGBColor(222,0,20) ins_ins_mut += 1 else: ins_point_mut += 1 Sequence_Check = \ Sequence_Check + frag_alignment[1][base_index] else: paragraph.add_run('_') ins_del_mut += 1 base_index += 1 base_index -= (len(frag_alignment[1]) - len(Observed_Sequence)) if frag_alignment[4] >= len(Observed_Sequence): base_index += (len(frag_alignment[1]) - len(Observed_Sequence)) break else: ins_sec_miss = True print 'finding suffix' #searches for suffix after last section of expected sequence if Short_Sequence: Suffix_Alignment = None else: Suffix_Alignment = pairwise2.align.localms(\ SUFFIX, Observed_Sequence, 1, -1, -3, -1) if Suffix_Alignment: Suffix_Alignment = Suffix_Alignment[0] if Suffix_Alignment[2] < 18: Suffix_Alignment = None if Suffix_Alignment: Suffix_Present = True if Last_Insert_Base: if ((Suffix_Alignment[3] - Last_Insert_Base) <= 5): Suffix_Adjacent = True if Suffix_Alignment[3] >= base_index: Before_Suffix = Observed_Sequence[base_index:Suffix_Alignment[3]] paragraph.add_run(Before_Suffix) Sequence_Check = Sequence_Check + Before_Suffix base_index = Suffix_Alignment[3] for base in Suffix_Alignment[0][\ Suffix_Alignment[3]:Suffix_Alignment[4]]: if base_index < len(Observed_Sequence): if base == Suffix_Alignment[1][base_index]: run = paragraph.add_run(base) font = run.font font.color.rgb = RGBColor(0,255,0) Sequence_Check = \ Sequence_Check + Suffix_Alignment[1][base_index] else: if Suffix_Alignment[1][base_index] != '-': run = paragraph.add_run(Suffix_Alignment[1][base_index]) font = run.font if Suffix_Alignment[0][base_index] == '-': font.color.rgb = RGBColor(222,0,20) suf_ins_mut += 1 else: suf_point_mut += 1 Sequence_Check = \ Sequence_Check + Suffix_Alignment[1][base_index] else: paragraph.add_run('_') suf_del_mut += 1 base_index += 1 base_index -= (len(Suffix_Alignment[1]) - len(Observed_Sequence)) elif Suffix_Alignment[3] < base_index < Suffix_Alignment[4]: for base in Suffix_Alignment[0][\ base_index:Suffix_Alignment[4]]: if base_index < len(Observed_Sequence): if base == Suffix_Alignment[1][base_index]: run = paragraph.add_run(base) font = run.font font.color.rgb = RGBColor(0,255,0) Sequence_Check = \ Sequence_Check + Suffix_Alignment[1][base_index] else: if Suffix_Alignment[1][base_index] != '-': run = paragraph.add_run(Suffix_Alignment[1][base_index]) font = run.font if Suffix_Alignment[0][base_index] == '-': font.color.rgb = RGBColor(222,0,20) suf_ins_mut += 1 else: suf_point_mut += 1 Sequence_Check = \ Sequence_Check + Suffix_Alignment[1][base_index] else: paragraph.add_run('_') suf_del_mut += 1 suf_alter_score += 1 base_index += 1 base_index -= (len(Suffix_Alignment[1]) - len(Observed_Sequence)) paragraph.add_run(Observed_Sequence[base_index:]) #keeps track of number and types of mutations #puts info in summary spreadsheet if pref_del_mut + pref_point_mut + pref_ins_mut != 0: Prefix_Perfect = False if ins_del_mut + ins_point_mut + ins_ins_mut != 0: Insert_Perfect = False if suf_del_mut + suf_point_mut + suf_ins_mut != 0: Suffix_Perfect = False if Prefix_Perfect and Insert_Perfect and Suffix_Perfect and \ Prefix_Adjacent and Suffix_Adjacent: Sequence_Perfect = True if Sequence_Perfect: sh.write(row_index,1,'sequence perfect') elif Bad_Sequence: sh.write(row_index,1,'bad sequence') else: result = '' if not Prefix_Present: result = result + 'prefix not found, ' if Prefix_Present and not Prefix_Adjacent and Insert_Present: result = result + 'prefix not adjacent to insert, ' if Prefix_Present and not Prefix_Perfect: pref_report = [] if pref_point_mut > 0: pref_report.append(str(pref_point_mut) + ' point mutations, ') if pref_ins_mut > 0: pref_report.append(str(pref_ins_mut) + ' insertions, ') if pref_del_mut > 0: pref_report.append(str(pref_del_mut) + ' deletions, ') result = result + 'prefix altered (' + ''.join(pref_report)[:-2] + '), ' if not Suffix_Present: result = result + 'suffix not found, ' if Suffix_Present and not Suffix_Adjacent and Insert_Present: result = result + 'suffix not adjacent to insert, ' if Suffix_Present and not Suffix_Perfect: suf_report = [] if suf_point_mut > 0: suf_report.append(str(suf_point_mut) + ' point mutations, ') if suf_ins_mut > 0: suf_report.append(str(suf_ins_mut) + ' insertions, ') if suf_del_mut > 0: suf_report.append(str(suf_del_mut) + ' deletions, ') result = result + 'insert altered (' + ''.join(suf_report)[:-2] + '), ' if not Insert_Present: result = result + 'insert not found, ' if Insert_Present and not Insert_Perfect: if ins_sec_miss: result = result + 'could not find section of insert, ' else: ins_report = [] if ins_point_mut > 0: ins_report.append(str(ins_point_mut) + ' point mutations, ') if ins_ins_mut > 0: ins_report.append(str(ins_ins_mut) + ' insertions, ') if ins_del_mut > 0: ins_report.append(str(ins_del_mut) + ' deletions, ') result = result + 'insert altered (' + ''.join(ins_report)[:-2] + '), ' if Short_Sequence: result = 'sequence short, ' + result if result: result = result[:-2] sh.write(row_index,1,result) else: sh.write(row_index,1,'anomalous result') row_index += 1 Sequence_Check = Sequence_Check + Observed_Sequence[base_index:] if Sequence_Check == Observed_Sequence: print "printed sequence correct" else: print "printed sequence altered. check against original sequence" print Sequence_Check document.save(output_path + '/' + Expected_Sequence.Name + '.docx') book.save(output_path + '/' + book_name + ' result summary.xlsx') except: print 'Problem Encountered. Skipping to Next Sequence.' class Expect_Sequence: """Holds information from row in input spreadsheet""" def __init__(self, row_values): self.Name = row_values[0] self.Observed_File = str(row_values[1]) self.Direction = str(row_values[2]).strip(' ').upper() self.Backbone = row_values[3] self.Expected_Inserts = row_values[4:] self.Colors = Color_Wheel() for i in range(len(self.Expected_Inserts)): self.Expected_Inserts[i] = ''.join(str(self.Expected_Inserts[i]).split('\n')).upper() self.Expected_Inserts = [x for x in self.Expected_Inserts if x != ''] temp = [] insert_names = self.Expected_Inserts[::2] insert_sequences = self.Expected_Inserts[1::2] for i in range(len(insert_names)): temp.append(Expected_Insert(insert_names[i],insert_sequences[i],self.Colors.Color())) self.Expected_Inserts = temp class Expected_Insert: """Holds part of expected sequence""" def __init__(self, Name, Sequence, Color): self.Name = Name self.Sequence = Sequence self.Color = Color class Color_Wheel: """Holds and returns color assignments for Word""" def __init__(self): self.Wheel = COLOR_WHEEL self.Pos = -1 def Color(self): self.Pos += 1 return self.Wheel[self.Pos%6] #reverse complement def Rev_Comp(seq): seq = seq.upper() rev_comp = '' for base in seq[::-1]: if base == 'A': base_comp = 'T' elif base == 'C': base_comp = 'G' elif base == 'T': base_comp = 'A' elif base == 'G': base_comp = 'C' else: base_comp = base rev_comp = rev_comp + base_comp return rev_comp yellow = RGBColor(255,225,45) pink = RGBColor(235,50,110) sky_blue = RGBColor(5,115,170) ochre = RGBColor(250,155,55) pale_orange = RGBColor(255,185,135) magenta = RGBColor(160,50,115) COLOR_WHEEL = [yellow,pink,sky_blue,ochre,pale_orange,magenta] PREFIX_ATG = 'GAATTCGCGGCCGCTTCTAG' PREFIX_NOT_ATG = 'GAATTCGCGGCCGCTTCTAGAG' SUFFIX = 'TACTAGTAGCGGCCGCTGCAG' main()