from labtools.adtools.finder import pull_AD, pull_barcode
from labtools.adtools.seqlib import read_fastq, read_fastq_big
import pandas as pd
[docs]def seq_counter(fastq, design_to_use = None, barcoded = False, only_bcs = False, **kwargs):
"""Counts occurences of ADs or AD-barcode pairs in a fastq file.
Parameters
----------
fastq : str
Path to fastq or fastq.gz file.
design_to_use : str, default None
Path to csv file containing ArrayDNA column.
barcoded : bool, default False
Whether to count ADs with different barcodes separately.
only_bcs : default False
True, False or the barcode map to use. If True, no map is used.
**kwargs : dict
Add additional arguments to pass to pull_AD or pull_barcode.
Returns
----------
counts : pandas.core.series.Series
Pandas series where indices are AD or AD/barcode sequences and values are counts.
Examples
----------
>>> seq_counter("../exampledata/mini.fastq")
GGTTCTTCTAAATTGAGATGTGATAATAATGCTGCTGCTCATGTTAAATTGGATTCATTTCCAGCTGGTGTTAGATTTGATACATCTGATGAAGAATTGTTGGAACATTTGGCTGCTAAA 1
GAAGAATTGTTTTTACATTTGTCTGCTAAGATTGGTAGATCTTCTAGGAAACCACATCCATTCTTGGATGAATTTATTCATACTTTGGTTGAAGAAGATGGTATTTGTAGAACTCATCCA 3
dtype: int64
"""
seqCounts = {}
# merge lists of fastq files pertaining to the same sample
if type(fastq) == list:
# compute by only counting barcodes
if only_bcs != False and design_to_use == None:
for file in fastq:
for line in read_fastq_big(file, **kwargs):
bc = pull_barcode(line[1],**kwargs)
if bc not in seqCounts and bc != None: seqCounts[bc] = 1
elif bc != None: seqCounts[bc] += 1
counts = pd.Series(seqCounts)
# deny barcode search if a design file is provided
elif only_bcs != False and design_to_use != None:
raise TypeError("Using a design file is not compatible with only barcodes.")
# compute by counting ADs or ADs + barcodes
else:
for file in fastq:
for line in read_fastq_big(file, **kwargs):
AD,bc = pull_AD(line[1], barcoded, **kwargs)
if barcoded and AD != None:
pair = (AD, bc)
if pair not in seqCounts and pair[0] != None: seqCounts[AD] = 1
elif pair[0] != None: seqCounts[AD] += 1
elif AD != None and AD not in seqCounts: seqCounts[AD] = 1
elif AD != None: seqCounts[AD] += 1
counts = pd.Series(seqCounts)
# fastq files are not provided in lists (one file per sample)
else:
# compute by only counting barcodes
if only_bcs != False and design_to_use == None:
for line in read_fastq_big(fastq, **kwargs):
bc = pull_barcode(line[1],**kwargs)
if bc not in seqCounts and bc != None: seqCounts[bc] = 1
elif bc != None: seqCounts[bc] += 1
counts = pd.Series(seqCounts)
# deny barcode search if a design file is provided
elif only_bcs != False and design_to_use != None:
raise TypeError("Using a design file is not compatible with only barcodes.")
# compute by counting ADs or ADs + barcodes
else:
for line in read_fastq_big(fastq, **kwargs):
AD,bc = pull_AD(line[1], barcoded, **kwargs)
if barcoded and AD != None:
pair = (AD, bc)
if pair not in seqCounts and pair[0] != None: seqCounts[AD] = 1
elif pair[0] != None: seqCounts[AD] += 1
elif AD != None and AD not in seqCounts: seqCounts[AD] = 1
elif AD != None: seqCounts[AD] += 1
counts = pd.Series(seqCounts)
# remove non-perfect matches if required
if design_to_use:
design = pd.read_csv(design_to_use)
if barcoded:
counts = counts.where(counts.index.droplevel(1).isin(design.ArrayDNA)).dropna()
else:
counts = counts.where(counts.index.isin(design.ArrayDNA)).dropna()
return counts
[docs]def create_map(ad_bcs, filter = False):
"""Converts output of seq_counter with AD,bc pairs to a dict map.
If the barcode is found with two different ADs, it is not included in
the dictionary.
Parameters
----------
ad_bcs : pd.Series
output counts from seq_counter with barcoded = True.
filter : int, default False
Number of reads below which to ignore the barcode.
Returns
----------
bc_dict : dict
Dictionary with barcodes as keys and 1 AD as value.
"""
bc_dict = {}
bad_bcs = 0
for line in zip(ad_bcs.index, ad_bcs):
ad = line[0][0]
bc = line[0][1]
count = line[1]
if filter:
if count < filter:
pass
elif count >= filter:
if bc not in bc_dict:
bc_dict[bc] = ad
elif bc in bc_dict and bc_dict[bc] == ad:
pass
elif bc in bc_dict and bc_dict[bc] != ad:
del bc_dict[bc]
bad_bcs += 1
else:
if bc not in bc_dict:
bc_dict[bc] = ad
elif bc in bc_dict and bc_dict[bc] == ad:
pass
elif bc in bc_dict and bc_dict[bc] != ad:
del bc_dict[bc]
bad_bcs += 1
return(bc_dict, bad_bcs)
[docs]def convert_bcs_from_map(bcs, bc_dict):
"""Takes bc only data and uses a barcode dictionary to return AD counts.
If the barcode is found with two different ADs, it is not included in
the dictionary.
Parameters
----------
bcs : pd.Series
output counts from seq_counter with only_bcs = True.
bc_dict : dict
Dictionary with barcodes as keys and 1 AD as value from create_map().
Returns
----------
converted : pd.Series
Pandas series where indices are AD sequences and values are counts.
"""
ads = []
for bc in bcs.index:
ad = None
if bc in bc_dict:
ad = bc_dict[bc]
ads.append(ad)
ad_col = pd.Series(ads)
x = pd.DataFrame(bcs).reset_index()
x["AD"] = ads
converted = x[[0, "AD"]].groupby("AD").sum()[0]
return converted
[docs]def sort_normalizer(pair_counts, bin_counts, thresh = 10):
"""Normalize by reads per sample, reads per tile and reads per bin.
Parameters
----------
pair_counts : list of pandas.core.series.Series
List of pandas series where indices are AD or AD/barcode sequences and values are counts.
bin_counts : list
List of number of cells per bin in the same order as the pair counts.
thresh : int, default 10
Number of reads above which to count the unique sequence.
Returns
----------
df : pandas.DataFrame
Pandas dataframe containing the normalized read counts.
numreads : pandas.DataFrame
Total read counts for each unique sequence.
reads : pandas.DataFrame
Read counts per bin for each unique sequence.
Examples
----------
>>> sort_normalizer([count1, count2], [1000,1000])
"""
df = pd.DataFrame(pair_counts)
df.fillna(0, inplace=True)
# 10 is the read minimum, should make this user defined
df = df.loc[:, (df.sum() > thresh)]
df = df.transpose()
numreads = df.sum(axis = 1)
reads = df.copy(deep = True)
#df = df_in.copy(deep=True)
# row i column j
for j in df:
df[j] = (df[j]/df[j].sum())/bin_counts[j]
for i, pair in enumerate(df.index):
df.iloc[i] = df.iloc[i]/df.iloc[i].sum()
return df, numreads, reads
[docs]def calculate_activity(df_in, bin_values, min_max = False):
"""Calculate the activity of a normalized sort df.
Parameters
----------
df_in : pandas.DataFrame
Dataframe output of sort_normalizer()
bin_values : list
List of mean or median fluorescence values per bin in the same order as the pair counts.
min_max : bool, default False
Whether to normalize the activity using min 0 max 1.
Returns
----------
df : pandas.DataFrame
Pandas dataframe containing the activity values per sequence or sequence-barcode pair.
"""
df = df_in.copy(deep=True)
activities = df_in.dot(bin_values)
if min_max:
activities = minmax_scale(activities)
df.loc[:,"Activity"] = activities
return df
# def seq_counter_parallel(fastq, design_to_use = None, barcoded = False, only_bcs = False, **kwargs):
# """Counts occurences of ADs or AD-barcode pairs in a fastq file in parallel. THIS IS BETA
# Parameters
# ----------
# fastq : str
# Path to fastq or fastq.gz file.
# design_to_use : str, default None
# Path to csv file containing ArrayDNA column.
# barcoded : bool, default False
# Whether to count ADs with different barcodes separately.
# only_bcs : default False
# True, False or the barcode map to use. If True, no map is used.
# Returns
# ----------
# counts : pandas.core.series.Series
# Pandas series where indices are AD or AD/barcode sequences and values are counts.
# Examples
# ----------
# >>> seq_counter("../exampledata/mini.fastq")
# GGTTCTTCTAAATTGAGATGTGATAATAATGCTGCTGCTCATGTTAAATTGGATTCATTTCCAGCTGGTGTTAGATTTGATACATCTGATGAAGAATTGTTGGAACATTTGGCTGCTAAA 1
# GAAGAATTGTTTTTACATTTGTCTGCTAAGATTGGTAGATCTTCTAGGAAACCACATCCATTCTTGGATGAATTTATTCATACTTTGGTTGAAGAAGATGGTATTTGTAGAACTCATCCA 3
# dtype: int64
# """
# seqCounts = {}
# def helper(file, design_to_use, barcoded, only_bcs, seqCounts, **kwargs):
# for line in read_fastq_big(file, **kwargs):
# AD,bc = pull_AD(line[1], barcoded, **kwargs)
# if barcoded and AD != None:
# AD = (AD, bc)
# if AD not in seqCounts and AD != None: seqCounts[AD] = 1
# elif AD != None: seqCounts[AD] += 1
# return seqCounts
# # merge lists of fastq files pertaining to the same sample
# if type(fastq) == list:
# for file in fastq:
# for line in read_fastq_big(file, **kwargs):
# seqCounts = helper(file, design_to_use, barcoded, only_bcs, seqCounts, **kwargs)
# counts = pd.Series(seqCounts)
# # remove non-perfect matches if required
# if design_to_use:
# design = pd.read_csv(design_to_use)
# if barcoded:
# counts = counts.where(counts.index.droplevel(1).isin(design.ArrayDNA)).dropna()
# else:
# counts = counts.where(counts.index.isin(design.ArrayDNA)).dropna()
# return counts
if __name__ == '__main__':
main