Source code for constrain.design.cloning

#!/usr/bin/env python
# MIT License
# Copyright (c) 2022, Technical University of Denmark (DTU)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.


""" Module used for cloning of microbial strains."""
import functools
import pandas as pd
import pydna
import Bio
import Bio.SeqFeature
import Bio.SeqRecord
import Bio.SeqIO
from Bio.Seq import Seq
from pydna.assembly import Assembly
from pydna.dseq import Dseq


[docs]def CAS9_cutting(gRNA_record, background_record): """Simulates double-stranded-break by CAS9 given a gRNA. Parameters ---------- gRNA_record: pydna.dseqrecord. A 20 bp DNA sequence background_record: pydna.dseqrecord. The sequence of interest for CRISPR mediated DSB Returns ------- 1.pydna.dseqrecord. Sequence upstream of the DSB: pydna.dseqrecord. 2. pydna.dseqrecord. Sequence downstream of the DSB: pydna.dseqrecord. """ gRNA_sequence = gRNA_record.seq.watson.upper() background_sequence = background_record.seq.upper() gRNA_strand = 1 if background_sequence.find(gRNA_sequence) == -1: gRNA_strand = -1 gRNA_sequence = Seq(gRNA_sequence) gRNA_sequence = (gRNA_sequence).reverse_complement() if background_sequence.find(gRNA_sequence) == -1: print("not on -1, CAN'T FIND THE CUT SITE IN YOUR SEQUENCE") if gRNA_strand == 1: cut_pos_rel_start = 17 else: cut_pos_rel_start = 3 gRNA_start = background_sequence.find(gRNA_sequence) cut_site = gRNA_start + cut_pos_rel_start up = background_record[0:cut_site] dw = background_record[cut_site : len(background_sequence)] up.name = "UP" + "_" + gRNA_record.name + "_" + background_record.name dw.name = "DW" + "_" + gRNA_record.name + "_" + background_record.name up_feature = Bio.SeqFeature.SeqFeature( Bio.SeqFeature.FeatureLocation(0, len(up)), type="misc_feature", strand=+1 ) up_feature.qualifiers["label"] = up.name up.features.append(up_feature) dw_feature = Bio.SeqFeature.SeqFeature( Bio.SeqFeature.FeatureLocation(0, len(dw)), type="misc_feature", strand=+1 ) dw_feature.qualifiers["label"] = dw.name dw.features.append(dw_feature) up = pydna.dseqrecord.Dseqrecord( up, ) dw = pydna.dseqrecord.Dseqrecord(dw) # UPS more than one cut site? if dw.seq.find(gRNA_sequence) != -1: print("OBS", gRNA_sequence, "cuts more than one location!") return (up, dw)
[docs]def extract_gRNAs(template, name): """Extracts gRNAs from a template. Parameters ---------- template: pydna.dseqrecord or pydna.amplicon.Amplicon a plasmid or piece of DNA name: str a string that would include the feature name for example: gRNA Returns ------- list of pydna.dseqrecord or pydna.amplicon.Amplicon list of with the found features and their sequences """ gRNAs = [] for feature in template.features: if name in feature.qualifiers.get("name", ""): gRNA = template[feature.location.start : feature.location.end] gRNA.name = feature.qualifiers.get("name", "") gRNAs.append(gRNA) return gRNAs
[docs]def remove_features_with_negative_loc(record): """Removes a SeqFeatures if negative. Parameters ---------- record: pydna.amplicon.Amplicon. A amplicon with SeqFeature and locations Returns ------- record: pydna.amplicon.Amplicon. With the negative features deleted """ for i in range(len(record.features)): if record.features[i].location.start < 0 or record.features[i].location.end < 0: del record.features[i]
[docs]def extract_template_amplification_sites(templates, names, terminator): """Extracts amplifications sites from a templates features Parameters ---------- templates: list of Bio.SeqRecord.SeqRecord list of Bio.SeqRecord.SeqRecord objects with SeqFeatures names: list of strings list of strings to be extracted terminator: str a string with the name of upstream terminator Returns ------- record: list of Bio.SeqRecord.SeqRecord list of extracted elements """ template_amplification_sites = [] for name, template in zip(names, templates): for feature in template.features: if feature.qualifiers["name"] == name: CDS_strand = feature.strand if CDS_strand == 1: start = feature.location.start print(start) else: end = feature.location.end print(end) if feature.qualifiers["name"].endswith(terminator): terminator_strand = feature.strand if terminator_strand == 1: end = feature.location.end else: start = feature.location.start print(start) template_amplification_site = template[start:end] if "batches" in template_amplification_site.annotations.keys(): template_amplification_site.annotations["batches"].append( template.annotations["batches"][0] ) # {'location': template.annotations['batches'][0]['location']} else: template_amplification_site.annotations["batches"] = [] template_amplification_site.annotations["batches"].append( template.annotations ) # {'location': template.annotations['batches'][0]['location']} template_amplification_sites.append(template_amplification_site) return template_amplification_sites
[docs]def extract_sites(annotations, templates, names): """This function extracts the sequences from annotated sequences based on their names Parameters ---------- annotations: list list of annotations for sequences that will be extracted templates: list of Bio.SeqRecord.SeqRecord A list of Bio.SeqRecord.SeqRecord with SeqFeatures names: str name of the sequence that will be extracted Returns ------- record: list of Bio.SeqRecord.SeqRecord list of extracted sites """ sites = [] for anno, template, name in zip(annotations, templates, names): for feature in template.features: # site = template[feature.location.start : feature.location.end] if str(feature.qualifiers["name"][0]) == anno: site = template[feature.location.start : feature.location.end] site.name = name # # If there is an batch anotation we can save it # if "batches" in site.annotations.keys(): # site.annotations["batches"].append( # template.annotations["batches"][0] # ) # else: # site.annotations["batches"] = [] # site.annotations["batches"].append(template.annotations) site.annotations = template.annotations sites.append(site) return sites
[docs]def seq_to_annotation(seqrec_from, seqrec_onto, aType): """Anotate an amplicon object from another amplicon object. Parameters ---------- seqrec_from: str annotation sequence that will be extracted seqrec_onto: list of Bio.SeqRecord.SeqRecord A list of Bio.SeqRecord.SeqRecord with SeqFeatures aType: str name of the sequence that will be extracted Returns ------- record: list of Bio.SeqRecord.SeqRecord list of extracted sites """ seq_from = seqrec_from.seq.watson.upper() seq_onto = seqrec_onto.seq.watson.upper() strand = 1 match_index = seq_onto.find(seq_from) # if there is match if match_index != -1: start = match_index end = start + len(seq_from) else: # If no match we look at the reverse complement seq_onto = Seq(seq_onto) rev_match_index = seq_onto.reverse_complement().find(seq_from) # if we get a match here if rev_match_index != -1: strand = -1 reclength = len(str(seqrec_onto.seq)) end = reclength - rev_match_index start = end - len(seq_from) else: print( "no match! seq:" + str(seqrec_from.name) + "\nnot annealing to:" + str(seqrec_onto.name) ) # add the feature to the amplicon feature = Bio.SeqFeature.SeqFeature( Bio.SeqFeature.FeatureLocation(start, end), type=aType, strand=strand ) feature.qualifiers["label"] = seqrec_from.id seqrec_onto.features.append(feature)
[docs]def USER_enzyme(amplicon): """Simulates digestion with USER enzyme. Parameters ---------- amplicon: pydna.amplicon.Amplicon An pydna.amplicon.Amplicon to with Uracil integrated Returns ------- Dseqrecord USER digested Dseqrecord with USER tails """ fw_U_idx = amplicon.forward_primer.seq.find("U") # fw_U_idx rv_U_idx = amplicon.reverse_primer.seq.find("U") # rv_U_idx digested_watson = amplicon.seq.watson[fw_U_idx + 1 :] # digested_watson digested_crick = amplicon.seq.crick[rv_U_idx + 1 :] # digested_crick digested_pcr = pydna.dseqrecord.Dseqrecord( pydna.dseq.Dseq(watson=digested_watson, crick=digested_crick), features=amplicon.features, annotations=amplicon.annotations, ) # digested_pcr.seq return digested_pcr
[docs]def nicking_enzyme(vector): """Nt.Bbc.CI (nicking enzyme, Nicks) a vector with the sequence 'CGCGTG' on watson and 'CGCACG' on crick strand. Parameters ---------- vector: Dseq digested Dseqrecord - usually with AsiSI or similar overhang Returns ------- Dseq with nick - ready for USER cloning """ if vector.seq[0:8].watson == "CGCGTG" and vector.seq.crick[:6] == "CGCACG": return Dseq(watson=vector.seq.watson[6:], crick=vector.seq.crick[6:], ovhg=8) else: print("No nicking sequnce")
[docs]def casembler( bg_strain, site_names=None, gRNAs=None, parts=None, assembly_limits=None, assembly_names=None, verbose=False, to_benchling=False, ): """Simulate in vivo assembly and integration with the possibility of printing to gb files or send it directly to benchling. Parameters ---------- bg_strain : GenBank strain of choice eg. genbank file site_names: list list of names e.g. [X-3, XI-3] gRNAs: Seqrecords list of 20 bp seqrecords e.g. [ATF1_gRNA, CroCPR_gRNA] parts: list list of list of parts e.g. [[ATF1_repair_template],[CPR_repair_template]] assembly_limits: list list of numbers of bp assembly limits e.g. [200,400] assembly_names: list list of names of DNA post assembly e.g.["X_3_tADH1_P2_pPGK1", "XI_3_UP_DW"] verbose: bool write DNA e.g. False to_benchling: bool upload DNA to benchling e.g. False Returns ------- One dseqrecord of assembled contig """ assemblies = [] for int_no in range(0, len(gRNAs)): gRNA = gRNAs[int_no] site_name = site_names[int_no] site = extract_sites([site_name], [bg_strain], [site_name])[0] UP, DW = CAS9_cutting(gRNA, site) fragments = [UP] + parts[int_no] + [DW] assembly = pydna.assembly.Assembly( fragments, limit=assembly_limits[int_no] ).assemble_linear()[0] # sometimes pydna.assembly.Assembly distorts the start, end location of features to become negative which produces and error when printing. # This function is created as a workaround. # CPR assembly gives DW_XI_3 annotation called "DW_XI_3" a negative start location. # A quick workaround is to remove featuere remove_features_with_negative_loc(assembly) assembly.name = assembly_names[int_no] assembly_feat = Bio.SeqFeature.SeqFeature( Bio.SeqFeature.FeatureLocation(0, len(assembly), strand=1), type="misc_feature", ) assembly_feat.qualifiers["name"] = site_names[int_no] assembly_feat.qualifiers["label"] = site_names[int_no] assembly.features.append(assembly_feat) if verbose: DNAs = [UP] + [assembly] + [DW] for DNA in DNAs: DNA.write("./" + DNA.name + ".gb") # "../data/processed/" if to_benchling: to_benchling(assembly, "to_benchling") assemblies.append(assembly) return functools.reduce(lambda x, y: x + y, assemblies)
[docs]def UPandDW(strain, isite_name, path_to_gRNA_table="../data/raw/gRNAtable.csv"): """Finds upstream and downstream sequences based on genome and site name. Parameters ---------- strain : str name of the strain eg. CENPK113-7d (you should specify path to the chromosome) isite_name : str a string of the site chomosomal site you want to retrieve Returns ------- UP_sites : list list of pydna.dseqrecord or pydna.amplicon.Amplicon DW_sites : list list of pydna.dseqrecord or pydna.amplicon.Amplicon """ # load lookup table gRNAtable = pd.read_csv(path_to_gRNA_table, index_col="name") chromosome_no = gRNAtable.loc[isite_name, "chromosome"] # load chromosome PathToChromosomeSeq = ( "../data/raw/" + strain + "/" + str(chromosome_no).zfill(2) + ".fa" ) ChromosomeSeq = Bio.SeqIO.read(PathToChromosomeSeq, "fasta").seq # define homology region location with respect to gRNA sequence # f_hom is the length of the UP homology # e_hom is the length of the DW homology # f_dist is distance from end of UP homology to the first base in isite_sequence # e_dist is distance from end of the isite_sequence to DW homology f_dist = gRNAtable.loc[isite_name, "f_dist"] e_dist = gRNAtable.loc[isite_name, "e_dist"] f_hom = gRNAtable.loc[isite_name, "f_hom"] e_hom = gRNAtable.loc[isite_name, "e_hom"] isite_sequence = gRNAtable.loc[isite_name, "sequence"] isite_sequence = Bio.Seq.Seq(isite_sequence) # Determine gRNA sequence strand gRNA_strand = 1 if ChromosomeSeq.find(isite_sequence) == -1: print("not on +1") gRNA_strand = -1 isite_sequence = isite_sequence.reverse_complement() if ChromosomeSeq.find(isite_sequence) == -1: print("not on -1") print("CAN'T FIND THE CUT SITE IN YOUR SEQUENCE") # Locate UP and DW StartIndex = ChromosomeSeq.find(isite_sequence) EndIndex = StartIndex UPseq = ChromosomeSeq[StartIndex + f_dist - f_hom : StartIndex + f_dist] DWseq = ChromosomeSeq[EndIndex + e_dist : EndIndex + e_dist + e_hom] UPrec = Bio.SeqRecord.SeqRecord(UPseq, name=isite_name + "UP") DWrec = Bio.SeqRecord.SeqRecord(DWseq, name=isite_name + "DW") # Annotate UP_feature = Bio.SeqFeature.SeqFeature( Bio.SeqFeature.FeatureLocation(0, len(UPseq)), type="misc_feature", strand=+1 ) UP_feature.qualifiers["label"] = UPrec.name UPrec.features.append(UP_feature) DW_feature = Bio.SeqFeature.SeqFeature( Bio.SeqFeature.FeatureLocation(0, len(DWseq)), type="misc_feature", strand=+1 ) DW_feature.qualifiers["label"] = DWrec.name DWrec.features.append(DW_feature) return ([UPrec], [DWrec])
[docs]def multiply_list(myList): """Multiplies elements one by one. Parameters ---------- myList: list list of integers to be multiplied Returns ------- result : int """ result = 1 for x in myList: result = result * x return result
[docs]def remove_tuple_duplicates(lst: list) -> list: """Removes tuple duplicates Parameters ---------- lst: list list with duplicated elements Returns ------- list without duplicates """ return [t for t in (set(tuple(i) for i in lst))]
[docs]def recs_no_duplicates(recs_with_duplicates: list) -> list: """Removes duplicate sequences from a list. Parameters ---------- recs_with_duplicates: list list with duplicated elements Returns ------- list without duplicates """ seen_sequences = set() recs_no_dup = [] for rec in recs_with_duplicates: if rec.seq.watson not in seen_sequences: recs_no_dup.append(rec) seen_sequences.add(rec.seq.watson) return recs_no_dup
[docs]def recs_no_duplicates_names(recs_with_duplicates): """Removes duplicate names from a list Parameters ---------- recs_with_duplicates: list list with duplicated elements Returns ------- list without duplicates """ seen_names = set() recs_no_dup = [] for rec in recs_with_duplicates: if rec.name not in seen_names: recs_no_dup.append(rec) seen_names.add(rec.name) return recs_no_dup
[docs]def plate_plot(df, value): """Plots a 96 well plate as a pandas df. Parameters ---------- df: pd.Dataframe A pandas dataframe with value: pandas dataframe column name The name of the pandas dataframe coloumn that you want to display Returns ------- pd.Dataframe in a 96 well plate format of the chosen column Example ------- 1. Initialize: Amplicon_df = { name location template_name fw_name fw_location rv_name rv_location prow pcol 29 PCR_G8H_01 l5_A03 VminG8H_tADH1 PR_G8H_01 op4_A10 PR_G8H_02 op4_A01 A 1 25 PCR_G8H_05 l5_A07 SmusG8H_tADH1 PR_G8H_01 op4_A10 PR_G8H_06 op4_A02 A 2 21 PCR_G8H_09 l5_B02 RsepG8H_tADH1 PR_G8H_01 op4_A10 PR_G8H_10 op4_A03 A 3 17 PCR_G8H_13 l5_B06 CacuG8H_tADH1 PR_G8H_01 op4_A10 PR_G8H_14 op4_A04 A 4 13 PCR_G8H_17 l5_C01 OpumG8H_tADH1 PR_G8H_01 op4_A10 PR_G8H_18 op4_A05 A 5 } 2. Call the function: # here we call the name coloumn plate_plot(amplicon_df, 'name') <<Result: name pcol 1 2 3 4 5 6 7 8 9 10 11 12 prow A PCR_G8H_01 PCR_G8H_05 PCR_G8H_09 PCR_G8H_13 PCR_G8H_17 PCR_G8H_21 PCR_G8H_25 PCR_G8H_29 PCR_G8H_33 PCR_UP_tADH1_01 PCR_PRO_01 NaN1 B PCR_G8H_02 PCR_G8H_06 PCR_G8H_10 PCR_G8H_14 PCR_G8H_18 PCR_G8H_22 PCR_G8H_26 PCR_G8H_30 PCR_G8H_34 PCR_TRP1-DW_02 PCR_PRO_02 NaN2 C PCR_G8H_03 PCR_G8H_07 PCR_G8H_11 PCR_G8H_15 PCR_G8H_19 PCR_G8H_23 PCR_G8H_27 PCR_G8H_31 PCR_G8H_35 PCR_TRP1-DW_01 PCR_PRO_03 NaN3 D PCR_G8H_04 PCR_G8H_08 PCR_G8H_12 PCR_G8H_16 PCR_G8H_20 PCR_G8H_24 PCR_G8H_28 PCR_G8H_32 PCR_G8H_36 NaN4 PCR_PRO_04 NaN5 E PCR_CPR_01 PCR_CPR_10 PCR_CPR_03 PCR_CPR_09 PCR_CPR_02 PCR_CPR_06 PCR_CPR_07 PCR_CPR_08 PCR_CPR_04 PCR_CPR_05 PCR_PRO_05 NaN6 F PCR_CPR_11 PCR_CPR_20 PCR_CPR_13 PCR_CPR_19 PCR_CPR_12 PCR_CPR_16 PCR_CPR_17 PCR_CPR_18 PCR_CPR_14 PCR_CPR_15 PCR_PRO_06 NaN7 G PCR_CPR_21 PCR_CPR_30 PCR_CPR_23 PCR_CPR_29 PCR_CPR_22 PCR_CPR_26 PCR_CPR_27 PCR_CPR_28 PCR_CPR_24 PCR_CPR_25 PCR_PRO_07 NaN8 H PCR_CPR_31 PCR_CPR_40 PCR_CPR_33 PCR_CPR_39 PCR_CPR_32 PCR_CPR_36 PCR_CPR_37 PCR_CPR_38 PCR_CPR_34 PCR_CPR_35 PCR_PRO_08 NaN9 """ cols = [value, "prow", "pcol"] return df[cols].set_index(["prow", "pcol"]).unstack(level=-1)
[docs]def CRIPSR_knockout(gRNA_record, insertion_site, repair_DNA): """Simple version of casembler - Cuts the insertion site with CAS9_cutting and assembles knockout with a repair template. Parameters ---------- gRNA_record: pydna.dseqrecord. A 20 bp DNA sequence insertion_site: pydna.dseqrecord. The site to knock out repair_DNA: pydna.dseqrecord. Repair template. Typucally 90 bp or longer Returns ------- pydna.dseqrecord. Of assembled contig after CRISPR-mediated KO """ # Create fragments after CAS9 cut IS_UP, IS_DW = CAS9_cutting(gRNA_record, insertion_site) # create list of parts and assemble to knockout sequence assmeble_parts = IS_UP, repair_DNA, IS_DW assembled_knockout = Assembly(assmeble_parts).assemble_linear()[0] return assembled_knockout