Source code for constrain.lims.csv_database

# MIT License
# Copyright (c) 2022, Technical University of Denmark (DTU)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

""" Easy to use functions to fetch sequences and objects from local csv database"""

import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os
import numpy as np


[docs]def get_unique_id(path="../data/csv_database") -> int:
    """Makes a single unique ID from the csv_database files.

    Parameters
    ----------
    path : str
        path to the local csv database
    Returns
    -------
    Unique_id : int
        a unique ID starting from 10000

    """
    # get list of files
    files = os.listdir(path)

    # get csv files - sometimes there is a jupyter file or something here
    csv_files = []
    for csv in files:
        if csv.endswith("csv"):
            csv_files.append(csv)
    # Pandas dataframe
    list_of_ids = []
    for df in csv_files:
        dataframe = pd.read_csv(path + "/" + df)
        list_of_ids += dataframe["ID"].to_list()

    # making an array of the and removing nan
    arr = np.array(list_of_ids)
    new_data = arr[~np.isnan(arr)]

    # If it is the first ID
    if new_data.size == 0:
        return 10000
    else:
        return int(np.amax(new_data)) + 1


[docs]def add_sequences_to_dataframe(list_of_DNA: list, csv_database_as_df, index=0) -> None:
    """Adds sequences to local csv databse.

    Parameters
    ----------
    list_of_DNA : list
        BioSeqrecord objects

    csv_database_as_df : pd.DataFrame
        your temporary csv database made into a pandas dataframe

    index : int
        designating which index you want the dna i.e could choose index= 288 which is plate 3 A1

    Returns
    -------
    None
        updates the dateframe with your sequences
    """
    counter = 0
    for ds_dna in list_of_DNA:
        # find index x
        if index != 0:
            blank_row_index = index + counter
            counter += 1
        else:
            # Find blank id spot
            blank_row_bool = csv_database_as_df.loc[:, "ID"].isna()  # get NaN records
            blank_row_index = [i + index for i, x in enumerate(blank_row_bool) if x][
                0
            ]  # get first index with nan ID

        # Changing the dataframe
        csv_database_as_df.loc[blank_row_index, "ID"] = int(ds_dna.id)
        csv_database_as_df.loc[blank_row_index, "description"] = ds_dna.description
        csv_database_as_df.loc[blank_row_index, "size"] = len(ds_dna.seq)
        csv_database_as_df.loc[blank_row_index, "seq"] = str(ds_dna.seq)
        csv_database_as_df.loc[blank_row_index, "date"] = str(
            pd.to_datetime("today").strftime("%m-%d-%Y")
        )
        csv_database_as_df.loc[blank_row_index, "name"] = ds_dna.name
        csv_database_as_df.loc[blank_row_index, "features"] = str(ds_dna.features)

        # annotations
        csv_database_as_df.loc[blank_row_index, "concentration"] = ds_dna.annotations[
            "batches"
        ][0]["concentration"]
        csv_database_as_df.loc[blank_row_index, "volume"] = ds_dna.annotations[
            "batches"
        ][0]["volume"]
        csv_database_as_df.loc[blank_row_index, "location"] = ds_dna.annotations[
            "batches"
        ][0]["location"]

        csv_database_as_df.loc[blank_row_index, "comments"] = ds_dna.annotations[
            "comments"
        ]
        csv_database_as_df.loc[blank_row_index, "reference"] = ds_dna.annotations[
            "reference"
        ]


[docs]def get_plate(plate_number: int, csv_database_as_df):
    """Returns the plate from a specified csv_database.
    Parameters
    ----------
    plate_number : int
        designating which plate to fetch
    csv_database_as_df : pd.DataFrame
        your temporary csv database made into a pandas dataframe

    Returns
    -------
    pd.Dataframe
        dataframe with the specified plate

    """
    plate = csv_database_as_df.loc[csv_database_as_df["plate"] == plate_number]

    return plate


[docs]def get_box(box_number: int, csv_database_as_df):
    """Returns the plate from a specified csv_database.
    Parameters
    ----------
    box_number : int
        designating which plate to fetch
    csv_database_as_df : pd.DataFrame
        your temporary csv database made into a pandas dataframe

    Returns
    -------
    pd.Dataframe
        dataframe with the specified plate
    """
    box = csv_database_as_df.loc[csv_database_as_df["box"] == box_number]

    return box


[docs]def add_unique_ids(list_of_parts: list, path="../data/csv_database") -> None:
    """Adds unique ids to a list of SeqRecords."""
    for i in range(len(list_of_parts)):
        unique_id = get_unique_id(path) + i
        list_of_parts[i].id = str(unique_id)


[docs]def add_annotations(
    list_of_parts: list,
    concentration: float = 0.0,
    reference: str = "",
    volume: float = 0.0,
    comments: str = "",
    location: str = "",
) -> list:

    """Adds the neccessary annotations to a list of
    SeqRecord objects to be uploaded to the database"""
    for annotations in list_of_parts:
        annotations.annotations = {
            "reference": reference,
            "comments": comments,
            "batches": [
                {
                    "location": location,
                    "volume": float(volume),
                    "concentration": float(concentration),
                }
            ],
        }
    return list_of_parts


[docs]def update_database(
    dataframe, which_database: str, path="../data/csv_database/"
) -> None:
    """Updates the database of choosing"""
    dataframe.to_csv(
        path + which_database + ".csv",
        index=False,
    )


[docs]def get_dna_from_plate_name(
    name: str,
    database_name: str,
    database_path="../data/csv_database/",
    genbank_files_path="../data/genbank_files/",
    genbank=False,
) -> SeqRecord:
    """fetch dna based on the name from the PLATE database of choice.

    Parameters
    ----------
    name : str
        name of the sequence you want to fetch
    database_name : str
        name of the database u want the sequence to be fetched from
    genbank_files_path : str
        filepath to your genbank files
    genbank : bool
        if True the function will fetch thegenbank file based on the unique ID.

    Returns
    -------
    Record : Bio.SeqRecord
        biopython object with values attached to its instances.
    """

    # initialize
    path = database_path + database_name + ".csv"
    dataframe = pd.read_csv(path)

    # find index of occurences of name in the dataframe and reset index
    found_the_record_df = dataframe.loc[
        dataframe["name"] == name
    ].reset_index()  # .index[0]

    # Fetching
    if genbank:
        ID_from_df = str(int(found_the_record_df.loc[0, "ID"]))
        Record = SeqIO.read(genbank_files_path + ID_from_df + ".gb", format="gb")
        Record.annotations = {
            "plate": found_the_record_df.loc[0, "plate"],
            "row": found_the_record_df.loc[0, "row"],
            "col": found_the_record_df.loc[0, "col"],
            # adding the batches
            "batches": [
                {
                    "location": str(found_the_record_df.loc[0, "location"])
                    + "_"
                    + str(found_the_record_df.loc[0, "plate"])
                    + "_"
                    + str(found_the_record_df.loc[0, "row"])
                    + str(found_the_record_df.loc[0, "col"]),
                    "volume": float(found_the_record_df.loc[0, "volume"]),
                    "concentration": float(found_the_record_df.loc[0, "concentration"]),
                }
            ],
        }
    else:
        # Taking data from the df
        Record = SeqRecord(Seq(str(found_the_record_df.loc[0, "seq"])))
        Record.id = str(found_the_record_df.loc[0, "ID"])
        Record.name = found_the_record_df.loc[0, "name"]
        Record.description = found_the_record_df.loc[0, "description"]
        Record.annotations = {
            "plate": found_the_record_df.loc[0, "plate"],
            "row": found_the_record_df.loc[0, "row"],
            "col": found_the_record_df.loc[0, "col"],
            # adding the batches
            "batches": [
                {
                    "location": str(found_the_record_df.loc[0, "location"])
                    + "_"
                    + str(found_the_record_df.loc[0, "plate"])
                    + "_"
                    + str(found_the_record_df.loc[0, "row"])
                    + str(found_the_record_df.loc[0, "col"]),
                    "volume": float(found_the_record_df.loc[0, "volume"]),
                    "concentration": float(found_the_record_df.loc[0, "concentration"]),
                }
            ],
        }

    return Record


[docs]def get_dna_from_box_name(
    name: str,
    database_name: str,
    database_path="../data/csv_database/",
    genbank_files_path="../data/genbank_files/",
    genbank=False,
) -> SeqRecord:
    """fetch dna based on the name from the BOX database of choice.

    Parameters
    ----------
    name : str
        name of the sequence you want to fetch
    database_name : str
        name of the database u want the sequence to be fetched from
    genbank_files_path : str
        filepath to your genbank files
    genbank : bool
        if True the function will fetch thegenbank file based on the unique ID.

    Returns
    -------
    Record : Bio.SeqRecord
        biopython object with values attached to its instances.
    """

    # initialize
    path = database_path + database_name + ".csv"
    dataframe = pd.read_csv(path)

    # find index of occurences of name in the dataframe and reset index
    found_the_record_df = dataframe.loc[
        dataframe["name"] == name
    ].reset_index()  # .index[0]

    # Fetching
    if genbank:
        ID_from_df = str(int(found_the_record_df.loc[0, "ID"]))
        Record = SeqIO.read(genbank_files_path + ID_from_df + ".gb", format="gb")
        Record.annotations = {
            "box": found_the_record_df.loc[0, "box"],
            "row": found_the_record_df.loc[0, "row"],
            "col": found_the_record_df.loc[0, "col"],
            # adding the batches
            "batches": [
                {
                    "location": str(found_the_record_df.loc[0, "location"])
                    + "_"
                    + str(found_the_record_df.loc[0, "box"])
                    + "_"
                    + str(found_the_record_df.loc[0, "row"])
                    + str(found_the_record_df.loc[0, "col"]),
                    "volume": float(found_the_record_df.loc[0, "volume"]),
                    "concentration": float(found_the_record_df.loc[0, "concentration"]),
                }
            ],
        }
    else:
        # Taking data from the df
        Record = SeqRecord(Seq(str(found_the_record_df.loc[0, "seq"])))
        Record.id = str(found_the_record_df.loc[0, "ID"])
        Record.name = found_the_record_df.loc[0, "name"]
        Record.description = found_the_record_df.loc[0, "description"]
        Record.annotations = {
            "box": found_the_record_df.loc[0, "box"],
            "row": found_the_record_df.loc[0, "row"],
            "col": found_the_record_df.loc[0, "col"],
            # adding the batches
            "batches": [
                {
                    "location": str(found_the_record_df.loc[0, "location"])
                    + "_"
                    + str(found_the_record_df.loc[0, "box"])
                    + "_"
                    + str(found_the_record_df.loc[0, "row"])
                    + str(found_the_record_df.loc[0, "col"]),
                    "volume": float(found_the_record_df.loc[0, "volume"]),
                    "concentration": float(found_the_record_df.loc[0, "concentration"]),
                }
            ],
        }

    return Record


[docs]def get_database(name: str, path="../data/csv_database/"):
    """Fetches the csv database as a pd.dataframe"""
    try:
        dataframe = pd.read_csv(path + name + ".csv", index_col=False)
        return dataframe
    except:
        print("Couldnt find that databse. Hack: Dont add csv extention.")


[docs]def change_row(row_index: int, csv_database_as_df, biopython_object):
    """inserts a biopyton object into the database at a specific index"""

    # Changing the dataframe
    csv_database_as_df.loc[row_index, "ID"] = int(biopython_object.id)
    csv_database_as_df.loc[row_index, "description"] = biopython_object.description
    csv_database_as_df.loc[row_index, "size"] = len(biopython_object.seq)
    csv_database_as_df.loc[row_index, "seq"] = str(biopython_object.seq)
    csv_database_as_df.loc[row_index, "date"] = str(
        pd.to_datetime("today").strftime("%m-%d-%Y")
    )
    csv_database_as_df.loc[row_index, "name"] = biopython_object.name
    csv_database_as_df.loc[row_index, "features"] = str(biopython_object.features)

    # annotations
    csv_database_as_df.loc[row_index, "concentration"] = biopython_object.annotations[
        "batches"
    ][0]["concentration"]
    csv_database_as_df.loc[row_index, "volume"] = biopython_object.annotations[
        "batches"
    ][0]["volume"]

    csv_database_as_df.loc[row_index, "comments"] = biopython_object.annotations[
        "comments"
    ]
    csv_database_as_df.loc[row_index, "reference"] = biopython_object.annotations[
        "reference"
    ]

    return csv_database_as_df


[docs]def delete_row_df(row_index, which_df):
    """Deletes a row in the database without changing the namse of"""

    coloumns = [
        "ID",
        "name",
        "size",
        "seq",
        "concentration",
        "features",
        "location",
        "reference",
        "volume",
        "comments",
        "description",
        "date",
    ]
    which_df.loc[row_index, coloumns] = np.nan