Source code for labtools.adtools.counter

from labtools.adtools.finder import pull_AD, pull_barcode
from labtools.adtools.seqlib import read_fastq, read_fastq_big
import pandas as pd

[docs]def seq_counter(fastq, design_to_use = None, barcoded = False, only_bcs = False, **kwargs):
    """Counts occurences of ADs or AD-barcode pairs in a fastq file.
    
    Parameters
    ----------
    fastq : str 
        Path to fastq or fastq.gz file.
    design_to_use : str, default None
        Path to csv file containing ArrayDNA column.
    barcoded : bool, default False
        Whether to count ADs with different barcodes separately.
    only_bcs : default False
        True, False or the barcode map to use. If True, no map is used.
    **kwargs : dict
        Add additional arguments to pass to pull_AD or pull_barcode.
    
    Returns
    ----------
    counts : pandas.core.series.Series
        Pandas series where indices are AD or AD/barcode sequences and values are counts.
    
    Examples
    ----------
    >>> seq_counter("../exampledata/mini.fastq")
    GGTTCTTCTAAATTGAGATGTGATAATAATGCTGCTGCTCATGTTAAATTGGATTCATTTCCAGCTGGTGTTAGATTTGATACATCTGATGAAGAATTGTTGGAACATTTGGCTGCTAAA    1
    GAAGAATTGTTTTTACATTTGTCTGCTAAGATTGGTAGATCTTCTAGGAAACCACATCCATTCTTGGATGAATTTATTCATACTTTGGTTGAAGAAGATGGTATTTGTAGAACTCATCCA    3
    dtype: int64
    """
    seqCounts = {}

    # merge lists of fastq files pertaining to the same sample
    if type(fastq) == list:
        # compute by only counting barcodes
        if only_bcs != False and design_to_use == None:
            for file in fastq:
                for line in read_fastq_big(file, **kwargs):
                    bc = pull_barcode(line[1],**kwargs)
                    
                    if bc not in seqCounts and bc != None: seqCounts[bc] = 1
                    elif bc != None: seqCounts[bc] += 1
                counts = pd.Series(seqCounts)
        # deny barcode search if a design file is provided
        elif only_bcs != False and design_to_use != None:
            raise TypeError("Using a design file is not compatible with only barcodes.")
        # compute by counting ADs or ADs + barcodes
        else:        
            for file in fastq:
                for line in read_fastq_big(file, **kwargs):
                    AD,bc = pull_AD(line[1], barcoded, **kwargs)
                    
                    if barcoded and AD != None:
                        pair = (AD, bc)
                    
                        if pair not in seqCounts and pair[0] != None: seqCounts[AD] = 1
                        elif pair[0] != None: seqCounts[AD] += 1

                    elif AD != None and AD not in seqCounts: seqCounts[AD] = 1
                    elif AD != None: seqCounts[AD] += 1
            counts = pd.Series(seqCounts)
    # fastq files are not provided in lists (one file per sample)
    else:
        # compute by only counting barcodes
        if only_bcs != False and design_to_use == None:
            for line in read_fastq_big(fastq, **kwargs):
                bc = pull_barcode(line[1],**kwargs)
                
                if bc not in seqCounts and bc != None: seqCounts[bc] = 1
                elif bc != None: seqCounts[bc] += 1
            counts = pd.Series(seqCounts)
        # deny barcode search if a design file is provided
        elif only_bcs != False and design_to_use != None:
            raise TypeError("Using a design file is not compatible with only barcodes.")
        # compute by counting ADs or ADs + barcodes
        else:
            for line in read_fastq_big(fastq, **kwargs):
                AD,bc = pull_AD(line[1], barcoded, **kwargs)
                
                if barcoded and AD != None:
                    pair = (AD, bc)
                
                    if pair not in seqCounts and pair[0] != None: seqCounts[AD] = 1
                    elif pair[0] != None: seqCounts[AD] += 1
                    
                elif AD != None and AD not in seqCounts: seqCounts[AD] = 1
                elif AD != None: seqCounts[AD] += 1
            counts = pd.Series(seqCounts)
    
    # remove non-perfect matches if required
    if design_to_use:
        design = pd.read_csv(design_to_use)
        if barcoded:
            counts = counts.where(counts.index.droplevel(1).isin(design.ArrayDNA)).dropna()
        else:
            counts = counts.where(counts.index.isin(design.ArrayDNA)).dropna()
    return counts

[docs]def create_map(ad_bcs, filter = False):
    """Converts output of seq_counter with AD,bc pairs to a dict map.
    
    If the barcode is found with two different ADs, it is not included in 
    the dictionary.

    Parameters
    ----------
    ad_bcs : pd.Series 
        output counts from seq_counter with barcoded = True.
    filter : int, default False
        Number of reads below which to ignore the barcode.

    Returns
    ----------
    bc_dict : dict
        Dictionary with barcodes as keys and 1 AD as value.
    """
    bc_dict = {}
    bad_bcs = 0
    for line in zip(ad_bcs.index, ad_bcs):
        ad = line[0][0]
        bc = line[0][1]
        count = line[1]
        if filter:
            if count < filter:
                pass
            elif count >= filter:
                if bc not in bc_dict:
                    bc_dict[bc] = ad
                elif bc in bc_dict and bc_dict[bc] == ad:
                    pass
                elif bc in bc_dict and bc_dict[bc] != ad:
                    del bc_dict[bc]
                    bad_bcs += 1
        else:
            if bc not in bc_dict:
                bc_dict[bc] = ad
            elif bc in bc_dict and bc_dict[bc] == ad:
                pass
            elif bc in bc_dict and bc_dict[bc] != ad:
                del bc_dict[bc]
                bad_bcs += 1

    return(bc_dict, bad_bcs)

[docs]def convert_bcs_from_map(bcs, bc_dict):
    """Takes bc only data and uses a barcode dictionary to return AD counts.
    
    If the barcode is found with two different ADs, it is not included in 
    the dictionary.

    Parameters
    ----------
    bcs : pd.Series 
        output counts from seq_counter with only_bcs = True.
    bc_dict : dict
        Dictionary with barcodes as keys and 1 AD as value from create_map().

    Returns
    ----------
    converted : pd.Series
        Pandas series where indices are AD sequences and values are counts.
    """
    ads = []
    for bc in bcs.index:
        ad = None
        if bc in bc_dict:
            ad = bc_dict[bc]
        ads.append(ad)
    ad_col = pd.Series(ads)
    x = pd.DataFrame(bcs).reset_index()
    x["AD"] = ads
    converted = x[[0, "AD"]].groupby("AD").sum()[0]
    return converted


[docs]def sort_normalizer(pair_counts, bin_counts, thresh = 10):
    """Normalize by reads per sample, reads per tile and reads per bin.
    
    Parameters
    ----------
    pair_counts : list of pandas.core.series.Series 
        List of pandas series where indices are AD or AD/barcode sequences and values are counts.
    bin_counts : list
        List of number of cells per bin in the same order as the pair counts.
    thresh : int, default 10
        Number of reads above which to count the unique sequence.
    
    Returns
    ----------
    df : pandas.DataFrame
        Pandas dataframe containing the normalized read counts.
    numreads : pandas.DataFrame
        Total read counts for each unique sequence.
    reads : pandas.DataFrame
        Read counts per bin for each unique sequence.
    
    Examples
    ----------
    >>> sort_normalizer([count1, count2], [1000,1000])
    """
    df = pd.DataFrame(pair_counts)
    df.fillna(0, inplace=True)
    # 10 is the read minimum, should make this user defined
    df = df.loc[:, (df.sum() > thresh)]
    df = df.transpose()
    numreads = df.sum(axis = 1)
    reads = df.copy(deep = True)
    #df = df_in.copy(deep=True)
    # row i column j
    for j in df:
        df[j] = (df[j]/df[j].sum())/bin_counts[j]
    for i, pair in enumerate(df.index):
        df.iloc[i] = df.iloc[i]/df.iloc[i].sum()
    
    return df, numreads, reads

[docs]def calculate_activity(df_in, bin_values, min_max = False):
    """Calculate the activity of a normalized sort df.
    
    Parameters
    ----------
    df_in : pandas.DataFrame
        Dataframe output of sort_normalizer()
    bin_values : list
        List of mean or median fluorescence values per bin in the same order as the pair counts.
    min_max : bool, default False
        Whether to normalize the activity using min 0 max 1.
    
    Returns
    ----------
    df : pandas.DataFrame
        Pandas dataframe containing the activity values per sequence or sequence-barcode pair.
    """
    df = df_in.copy(deep=True)
    activities = df_in.dot(bin_values)
    if min_max:
        activities = minmax_scale(activities)
    
    df.loc[:,"Activity"] = activities
    return df

# def seq_counter_parallel(fastq, design_to_use = None, barcoded = False, only_bcs = False, **kwargs):
#     """Counts occurences of ADs or AD-barcode pairs in a fastq file in parallel. THIS IS BETA
    
#     Parameters
#     ----------
#     fastq : str 
#         Path to fastq or fastq.gz file.
#     design_to_use : str, default None
#         Path to csv file containing ArrayDNA column.
#     barcoded : bool, default False
#         Whether to count ADs with different barcodes separately.
#     only_bcs : default False
#         True, False or the barcode map to use. If True, no map is used.
    
#     Returns
#     ----------
#     counts : pandas.core.series.Series
#         Pandas series where indices are AD or AD/barcode sequences and values are counts.
    
#     Examples
#     ----------
#     >>> seq_counter("../exampledata/mini.fastq")
#     GGTTCTTCTAAATTGAGATGTGATAATAATGCTGCTGCTCATGTTAAATTGGATTCATTTCCAGCTGGTGTTAGATTTGATACATCTGATGAAGAATTGTTGGAACATTTGGCTGCTAAA    1
#     GAAGAATTGTTTTTACATTTGTCTGCTAAGATTGGTAGATCTTCTAGGAAACCACATCCATTCTTGGATGAATTTATTCATACTTTGGTTGAAGAAGATGGTATTTGTAGAACTCATCCA    3
#     dtype: int64
#     """
#     seqCounts = {}
    
#     def helper(file, design_to_use, barcoded, only_bcs, seqCounts, **kwargs):
#         for line in read_fastq_big(file, **kwargs):
#             AD,bc = pull_AD(line[1], barcoded, **kwargs)
                
#             if barcoded and AD != None:
#                 AD = (AD, bc)
#             if AD not in seqCounts and AD != None: seqCounts[AD] = 1
#             elif AD != None: seqCounts[AD] += 1
#         return seqCounts

#     # merge lists of fastq files pertaining to the same sample
#     if type(fastq) == list:
#         for file in fastq:
#             for line in read_fastq_big(file, **kwargs):
#                 seqCounts = helper(file, design_to_use, barcoded, only_bcs, seqCounts, **kwargs)
#         counts = pd.Series(seqCounts)

#     # remove non-perfect matches if required
#     if design_to_use:
#         design = pd.read_csv(design_to_use)
#         if barcoded:
#             counts = counts.where(counts.index.droplevel(1).isin(design.ArrayDNA)).dropna()
#         else:
#             counts = counts.where(counts.index.isin(design.ArrayDNA)).dropna()
#     return counts

[docs]def main():
    pass

if __name__ == '__main__':
    main