Source code for labtools.shuffle

import numpy as np
import random
import pandas as pd
import math
import more_itertools
from collections import Counter

[docs]def windowed_shuffle(seq, window_size=5, step_size=1, n_shuffles=1, short_last_window = False): """Return a number of uniquely shuffled sequences. If n_shuffles is higher than the number of possible unique shuffles in that window, only the unique shuffles are returned. Returns sequences with shuffled letters in lowercase. Parameters ---------- seq : str Sequence to shuffle. window_size : int, default 5 Size of shuffle region. step_size : int, default 1 Distance between window starts (default 1). n_shuffles : int, default 1 Number of shuffles to make for each region (default 1). short_last_window : bool, default False Whether last window can be less than window_size to include all letters (default False). Returns ---------- list A list of shuffled sequences. Examples ---------- >>> shuffles_list, names_list = shuffle.windowed_shuffle("ABCDEFGHIJKLMNOPQRSTUVWYXZ") ...shuffles_list ['decbaFGHIJKLMNOPQRSTUVWYXZ', 'AfdecbGHIJKLMNOPQRSTUVWYXZ', 'ABfcdegHIJKLMNOPQRSTUVWYXZ', 'ABCfdhgeIJKLMNOPQRSTUVWYXZ', 'ABCDhgeifJKLMNOPQRSTUVWYXZ', 'ABCDEifgjhKLMNOPQRSTUVWYXZ', 'ABCDEFhjgkiLMNOPQRSTUVWYXZ', 'ABCDEFGkjilhMNOPQRSTUVWYXZ', 'ABCDEFGHikmljNOPQRSTUVWYXZ', 'ABCDEFGHIjmlknOPQRSTUVWYXZ', 'ABCDEFGHIJomklnPQRSTUVWYXZ', 'ABCDEFGHIJKpomlnQRSTUVWYXZ', 'ABCDEFGHIJKLpoqnmRSTUVWYXZ', 'ABCDEFGHIJKLMronqpSTUVWYXZ', 'ABCDEFGHIJKLMNsqrpoTUVWYXZ', 'ABCDEFGHIJKLMNOrsqtpUVWYXZ', 'ABCDEFGHIJKLMNOPsutrqVWYXZ', 'ABCDEFGHIJKLMNOPQstruvWYXZ', 'ABCDEFGHIJKLMNOPQRvsutwYXZ', 'ABCDEFGHIJKLMNOPQRSuvwytXZ', 'ABCDEFGHIJKLMNOPQRSTxuvywZ', 'ABCDEFGHIJKLMNOPQRSTUwxzyv'] """ sequence_list = [] name_list = [] step_size = int(step_size) # determine the number of windows if short_last_window: rounder = math.ceil else: rounder = math.floor if window_size == step_size: n_windows = rounder(len(seq)/step_size) else: n_windows = rounder((len(seq)-window_size+step_size)/step_size) for window in range(0,n_windows): # determine first and last indices of the shuffle window start = 0+step_size*window end = start+window_size # divide the sequence into regions to shuffle or not shuffle pre_shuffle = seq[0:start] shuffle_region = seq[start:end] post_shuffle = seq[end:] # calculate the maximum number of unique shuffles for the given sequence # see explanation at https://getcalc.com/statistics-permutation-calculator.htm # this prevents the later while loop from looking for new shuffles when none exist max_unique_shuffles = math.factorial(len(shuffle_region)) \ /(np.prod([math.factorial(num) for num in list(Counter(shuffle_region).values())]))-1 n = n_shuffles if max_unique_shuffles < n_shuffles: n = max_unique_shuffles # create shuffles shuffle_list = [] for j in np.arange(n): shuffle = ''.join(more_itertools.random_permutation(shuffle_region)) # keep generating shuffles until a new one is obtained while shuffle in shuffle_list or shuffle == shuffle_region: shuffle = ''.join(more_itertools.random_permutation(shuffle_region)) # keep track of generated shuffles shuffle_list.append(shuffle) # append the full sequences and names shuffled_seq = pre_shuffle + shuffle.lower() + post_shuffle sequence_list.append(shuffled_seq) name = f'shuffle_size{window_size}_ss{step_size}_pos{window}_{int(j+1)}' name_list.append(name) return sequence_list, name_list