Source code for constrain.test.genotyping

#!/usr/bin/env python
# MIT License
# Copyright (c) 2022, Technical University of Denmark (DTU)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

import pandas as pd
import numpy as np
from Bio import pairwise2


def slicing_and_naming_seq_plates(sequencing_plates, where_to_slice=7) -> list:
    """Rename the columns of each Plate2Seq dataframe and drop its leading rows.

    Used to ease pre-processing of Plate2Seq excel files.

    Parameters
    ----------
    sequencing_plates : list of pd.DataFrame
        Plate2Seq dataframes. Each one must have exactly seven columns,
        otherwise pandas raises ``ValueError`` when the names are assigned.
    where_to_slice : int
        Row offset used as ``plate[where_to_slice:]``; the rows before it
        are discarded (presumably the report header rows of the excel file).

    Returns
    -------
    list of pd.DataFrame
        The same list object that was passed in, now holding the sliced
        dataframes.

    Notes
    -----
    The input is modified in place: the column names of the caller's
    dataframes are overwritten and the list entries are replaced by the
    sliced dataframes.

    """
    column_names = (
        "Number",
        "Sample-Name",
        "AvgQual",
        "Length",
        "GoodQualFrom",
        "GoodQualTo",
        "used",
    )

    for position, plate in enumerate(sequencing_plates):
        # Rename first, then slice, so the sliced frame carries the new names.
        plate.columns = column_names
        sequencing_plates[position] = plate[where_to_slice:]

    return sequencing_plates


def plat_seq_data_wrangler(sequencing_plates: list) -> list:
    """Convert the quality columns of Plate2Seq dataframes to numeric values.

    Non-numeric entries (for example the ``"n.a."`` placeholder) become NaN.
    No rows are dropped.

    Parameters
    ----------
    sequencing_plates : list of pd.DataFrame
        Sliced Plate2Seq dataframes containing the columns ``Number``,
        ``Sample-Name``, ``AvgQual``, ``Length``, ``GoodQualFrom``,
        ``GoodQualTo`` and ``used``.

    Returns
    -------
    list of pd.DataFrame
        One new dataframe per input plate with the columns ordered as
        ``Number``, ``Sample-Name`` and then the five numeric columns.

    """
    numeric_columns = ["AvgQual", "Length", "GoodQualFrom", "GoodQualTo", "used"]
    wrangled_plates = []

    for plate in sequencing_plates:
        # errors="coerce" turns every unparsable entry, including "n.a.",
        # into NaN, so no separate replace step (and no np.NaN, which was
        # removed in NumPy 2.0) is needed.
        numeric_values = plate[numeric_columns].apply(pd.to_numeric, errors="coerce")

        # Put the identifying columns back in front of the numeric ones.
        wrangled_plates.append(
            pd.concat([plate["Number"], plate["Sample-Name"], numeric_values], axis=1)
        )

    return wrangled_plates


def plate_AvgQual(list_of_dfs_numeric: list, Avg_qual=50, used_bases=25) -> list:
    """Keep only the rows that pass both quality thresholds.

    Parameters
    ----------
    list_of_dfs_numeric : list of pd.DataFrame
        Plate2Seq dataframes with numeric ``AvgQual`` and ``used`` columns.
    Avg_qual : int
        A row is kept only if its ``AvgQual`` is strictly greater than this.
    used_bases : int
        A row is kept only if its ``used`` value is strictly greater than this.

    Returns
    -------
    list of pd.DataFrame
        One filtered dataframe per input plate. Rows with NaN in either
        column are dropped because NaN never compares greater than a number.

    """
    filtered_plates = []

    for plate in list_of_dfs_numeric:
        passes_quality = plate["AvgQual"] > Avg_qual
        passes_used = plate["used"] > used_bases
        filtered_plates.append(plate[passes_quality & passes_used])

    return filtered_plates


def split_df_names(
    df_names_column, which_column_to_split1=0, which_column_to_split2=2
) -> list:
    """Add ``plate`` and ``well`` columns derived from the sample names.

    Each ``Sample-Name`` is split on ``"_"`` and two of the resulting
    pieces are appended to the dataframe as new columns.

    Parameters
    ----------
    df_names_column : list of pd.DataFrame
        Plate2Seq dataframes. Each one must have exactly seven columns
        (the result is relabelled with nine fixed names) including
        ``Sample-Name``.
    which_column_to_split1 : int
        Position of the name piece stored in the ``plate`` column.
    which_column_to_split2 : int
        Position of the name piece stored in the ``well`` column.

    Returns
    -------
    list of pd.DataFrame
        New dataframes with the columns ``Number``, ``Sample-Name``,
        ``AvgQual``, ``Length``, ``GoodQualFrom``, ``GoodQualTo``, ``used``,
        ``plate`` and ``well``. The input dataframes are not modified.

    """
    column_names = (
        "Number",
        "Sample-Name",
        "AvgQual",
        "Length",
        "GoodQualFrom",
        "GoodQualTo",
        "used",
        "plate",
        "well",
    )
    df_with_names_split = []

    for plate in df_names_column:
        # expand=True yields one integer-labelled column per name piece.
        name_parts = plate["Sample-Name"].str.split("_", expand=True)

        combined = pd.concat(
            [
                plate,
                name_parts[which_column_to_split1],
                name_parts[which_column_to_split2],
            ],
            axis=1,
            ignore_index=False,
        )

        # The two appended columns carry integer labels; give every column
        # its final name in one go.
        combined.columns = column_names
        df_with_names_split.append(combined)

    return df_with_names_split


def concatenating_list_of_dfs(list_of_dfs: list) -> pd.DataFrame:
    """Concatenate a list of dataframes row-wise into one pd.DataFrame.

    The original row labels are kept (``ignore_index=False``), so the
    result can contain repeated index values.
    """
    return pd.concat(list_of_dfs, axis=0, ignore_index=False)


def pairwise_alignment_of_templates(
    reads: list, templates: list, primers: list
) -> pd.DataFrame:
    """Infer which template each read belongs to from the best alignment score.

    Every read is stripped of ``N`` bases, trimmed so that it starts at the
    first primer found in it, and then locally aligned against every
    template. The template with the highest score is reported.

    Parameters
    ----------
    reads : list of Bio.SeqRecord.SeqRecord
        Sequencing reads, e.g. .ab1 files parsed into SeqRecord objects.
    templates : list of Bio.SeqRecord.SeqRecord
        Candidate templates (for example plasmid parts). ``name`` is reported
        as ``inf_part_name`` and ``description`` as ``inf_part_number``.
    primers : list of Bio.SeqRecord.SeqRecord
        Primers used to find where the read should start.

    Returns
    -------
    pd.DataFrame
        One row per read with the columns ``Sample-Name``, ``inf_part_name``,
        ``align_score`` and ``inf_part_number``. Reads that have 25 or fewer
        bases left after cleaning, or that score 0 against every template,
        get ``align_score`` 0.0 and no template (``None``).

    Example
    -------
    >>> df_alignment = pairwise_alignment_of_templates(
    ...     reads, templates, primers_for_seq
    ... )  # doctest: +SKIP
    >>> df_alignment  # doctest: +SKIP
         Sample-Name                    inf_part_name  align_score  inf_part_number
    132  yp53re_cpr_A10_A10-pad_cpr_fw  pCCW12         634.0        5
    188  yp53re_cpr_A11_A11-pad_cpr_fw  pTPI1          904.0        6
    247  yp53re_cpr_A12_A12-pad_cpr_fw  pTPI1          851.0        6
    93   yp53re_cpr_A1_A01-pad_cpr_fw   pCCW12         543.0        5
    41   yp53re_cpr_A2_A02-pad_cpr_fw   pCCW12         636.0        5

    Notes
    -----
    To get a meaningful ``inf_part_number`` column, set the description of
    each template record beforehand, e.g. ``pCCW12.description = '1'``.
    """
    read_names = []
    template_names = []
    best_scores = []
    template_numbers = []

    for read in reads:
        sample = read.seq.replace("N", "")

        # Start the read at the first primer that occurs in it. The slice has
        # to be assigned back, otherwise the trimming has no effect.
        for primer in primers:
            start = sample.find(primer.seq)
            if start != -1:
                sample = sample[start:]
                break

        # Reset the result for every read so that a short or unmatched read
        # never inherits the values of the previous one.
        best_score = 0.0
        best_name = None
        best_number = None

        # Reads with 25 or fewer bases left are not aligned at all.
        if len(sample) > 25:
            for template in templates:
                # localxx: local alignment, match = 1, no mismatch or gap
                # penalties; score_only avoids building the alignments.
                alignment_score = pairwise2.align.localxx(
                    template.seq, sample, score_only=True
                )

                if alignment_score > best_score:
                    best_score = alignment_score
                    best_name = template.name
                    best_number = template.description

        read_names.append(read.name)
        template_names.append(best_name)
        best_scores.append(best_score)
        template_numbers.append(best_number)

    return pd.DataFrame(
        {
            "Sample-Name": read_names,
            "inf_part_name": template_names,
            "align_score": best_scores,
            "inf_part_number": template_numbers,
        }
    )