Source code for senid.intrinsic._intrinsic_analysis



from typing import Tuple
import monod
from monod import preprocess, extract_data, cme_toolbox, inference, analysis, mminference
from itertools import product
import anndata as ad 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def _get_log_fold_changes(sr1: inference.SearchResults,
                          sr2: inference.SearchResults,
                          sd1: extract_data.SearchData,
                          sd2: extract_data.SearchData,
                          dname1: str,
                          dname2: str,
                          gene_names: list[str],
                          gf_rej: bool = False,
                          param_lfc: float = 2.0,
                          mean_lfc: float = 1.0,
                          pval_thr: float = 0.05,
                          outlier_de: bool = True,
                          single_nuc: bool = False,
                          correct_off: bool = False):
    '''
    Utilize different metrics to find fold-changes (FCs) between clusters in different SRs

    sr1: SearchResults object 1 (single object)
    sr2: SearchResults object 2
    sd1: SearchData object for sr1
    sd2: SearchData object for sr2
    gene_names: list of gene names to compare between SRs
    gf_rej: whether to use boolean list of rejected genes from both SRs
    param_lfc: FC threshold value (to call DE-theta genes)
    mean_lfc: Mean S expression threshold value, for genes to consider
    outlier_de: Use iterative outlier calling procedure to assign DE-theta genes (see Monod https://github.com/pachterlab/monod_examples/blob/main/Monod_demo.ipynb)
    single_nuc: is this single_nuclear RNA data
    correct_off: boolean to correct parameter offset with ODR
    '''

    param_lfcs = pd.DataFrame()
    fcs,types,which_pair,highFC,spliceFC,g_names,out_de = ([] for i in range(7))

    sr1 = sr1
    sr2 = sr2

    gfilt1 = [list(sr1.gene_names).index(i) for i in gene_names]
    gfilt2 = [list(sr2.gene_names).index(i) for i in gene_names]

    par_vals1 = np.copy(sr1.param_estimates[:,gfilt1,:])
    par_vals2 = np.copy(sr2.param_estimates[:,gfilt2,:])

    parnames = ('b','beta','gamma')

    if correct_off:
        param_names = sr1.model.get_log_name_str()
        offsets = []
        for k in range(len(param_names)):
            m1 = par_vals1[0,:,k]
            m2 = par_vals2[0,:,k]
            offset = analysis.diffexp_fpi(m1,m2,param_names[k],viz=False, pval_thr=pval_thr)[1]
            par_vals2[0,:,k] -= offset

        fc_par = (par_vals1-par_vals2)/np.log10(2)
    else:
        fc_par = (par_vals1-par_vals2)/np.log10(2)  #Get FCs between cluster params

    print('fc_par.shape: ',fc_par.shape)
    if single_nuc:
        fc_s_par = np.log2((sd1.layers[0][gfilt1,:].mean(1) + 1e-12)/(sd2.layers[0][gfilt2,:].mean(1) + 1e-12)) #Get unspliced FCs
    else:
        fc_s_par = np.log2((sd1.layers[1][gfilt1,:].mean(1) + 1e-12)/(sd2.layers[1][gfilt2,:].mean(1) + 1e-12)) #Get spliced FCs

    print('fc_s_par.shape: ',fc_s_par.shape)

    #Use outlier calling method to find DE genes
    if outlier_de:
        if len(sr1.gene_names) != len(sr2.gene_names):
            print('Not running outlier DE. SRs need to have the same gene dimensions.')
            par_bool_de = np.zeros((len(gene_names),len(parnames)))
        else:
            dr_analysis = monod.analysis.diffexp_pars(sr1,sr2,viz=True,modeltype='id',use_sigma=True, pval_thr=pval_thr)
            par_bool_de = dr_analysis[1].T

  #-----is parameter FC significant -----
    if gf_rej is False:
        gf_rej = [True]*(len(gfilt1))
    else:
        gf_rej = ~(sr1.rejected_genes[gfilt1]) & ~(sr2.rejected_genes[gfilt2])


    for n in range(len(parnames)):
        #Boolean for if large param FC and not rejected gene (with minimum expression)
        if single_nuc:
            gf_highnoise = (np.abs(fc_par[0,:,n])>param_lfc)  \
                & ((sd1.layers[0][gfilt1,:].mean(1)>mean_lfc) | (sd2.layers[0][gfilt2,:].mean(1)>mean_lfc)) \
                & gf_rej
        else:
            gf_highnoise = (np.abs(fc_par[0,:,n])>param_lfc)  \
                & ((sd1.layers[1][gfilt1,:].mean(1)>mean_lfc) | (sd2.layers[1][gfilt2,:].mean(1)>mean_lfc)) \
                & gf_rej

        #Boolean for FC (above) but no FC detected at S-level
        gf_highnoise_meanS = gf_highnoise & (np.abs(fc_s_par)<mean_lfc) & gf_rej

        #Boolean for FC (above)
        gf_onlyhigh = gf_highnoise & gf_rej

        #For dataframe
        fcs += list(fc_par[0,gf_rej,n])
        g_names += list(gene_names[gf_rej])
        which_pair += [[dname1, dname2]]*np.sum(gf_rej)
        highFC += list(gf_onlyhigh[gf_rej])
        spliceFC += list(gf_highnoise_meanS[gf_rej])
        types += [parnames[n]]*np.sum(gf_rej)
        if outlier_de:
            out_de += list(par_bool_de[gf_rej,n])

    if outlier_de:
        param_lfcs['deTheta_outlier'] = out_de

    param_lfcs['log2FC'] = fcs
    param_lfcs['gene'] = g_names
    param_lfcs['cluster_pair'] = which_pair
    param_lfcs['deTheta_FC'] = highFC
    param_lfcs['deTheta_noDeMuS'] = spliceFC
    param_lfcs['param'] = types

    return param_lfcs

def _get_DE_results(param_lfcs: pd.DataFrame) -> pd.DataFrame:

    # Pre-filtering by parameter
    df_b = param_lfcs[param_lfcs.param == 'b'].reset_index(drop=True)
    df_beta = param_lfcs[param_lfcs.param == 'beta'].reset_index(drop=True)
    df_gamma = param_lfcs[param_lfcs.param == 'gamma'].reset_index(drop=True)

    # Extracting vectors
    bs = df_b.log2FC
    betas = df_beta.log2FC
    gammas = df_gamma.log2FC
    names = df_b.gene

    # Combining significance masks
    sig_b = df_b.deTheta_FC | df_b.deTheta_noDeMuS
    sig_beta = df_beta.deTheta_FC | df_beta.deTheta_noDeMuS
    sig_gamma = df_gamma.deTheta_FC | df_gamma.deTheta_noDeMuS

    highFCs = df_b.deTheta_FC.values | df_beta.deTheta_FC.values | df_gamma.deTheta_FC.values
    noSpliceFCs = df_b.deTheta_noDeMuS.values | df_beta.deTheta_noDeMuS.values | df_gamma.deTheta_noDeMuS.values

    dom = []
    dom_type = []

    # Prepare split cluster pairs
    pairs = df_b.cluster_pair.apply(lambda x: x if isinstance(x, list) else x.split(','))
    pairs = np.vstack(pairs.values)
    # pairs = df_b.cluster_pair.str.split(',').tolist()
    # pairs = np.array(pairs)

    # Precompute necessary comparisons
    abs_b = np.abs(bs)
    abs_beta = np.abs(betas)
    abs_gamma = np.abs(gammas)

    # Vectorised way of considering significance conditions
    cond1 = (-betas > 0) & sig_beta & (-gammas > 0) & sig_gamma # Possibly significant in k (sig. change in both beta and k)
    cond2 = (betas > 0) & sig_beta & (gammas > 0) & sig_gamma # Possibly significant in k (sig. change in both beta and k)
    cond3 = sig_b & (abs_b > abs_beta) & (abs_b > abs_gamma) # Sig in b
    cond4 = sig_b & (abs_b < abs_beta) & (abs_b >= abs_gamma) # Sig in b
    cond5 = sig_b & (abs_b < abs_gamma) & (abs_b >= abs_beta) # Sig in b
    cond6 = (betas > 0) & sig_beta
    cond7 = (gammas > 0) & sig_gamma # Sig in gamma
    cond8 = (betas < 0) & sig_beta
    cond9 = (gammas < 0) & sig_gamma # Sig in gamma

    # Initialize outputs
    dom = np.full(len(bs), 'None', dtype='<U50')
    dom_type = np.full(len(bs), 'None', dtype='<U50')

    # Apply conditions vectorized
    mask = cond1 & (abs_b < abs_beta)
    dom[mask] = pairs[mask, 0]
    dom_type[mask] = 'frequency'

    mask = cond1 & (bs > 0) & sig_b
    dom[mask] = pairs[mask, 0]
    dom_type[mask] = 'frequency'

    mask = cond1 & (bs < 0) & sig_b
    dom[mask] = pairs[mask, 1]
    dom_type[mask] = 'frequency'

    mask = cond2 & (abs_b < abs_beta)
    dom[mask] = pairs[mask, 1]
    dom_type[mask] = 'frequency'

    mask = cond2 & (bs < 0) & sig_b
    dom[mask] = pairs[mask, 1]
    dom_type[mask] = 'frequency'

    mask = cond3 & (bs > 0)
    dom[mask] = pairs[mask, 0]
    dom_type[mask] = 'synthesis'

    mask = cond3 & (bs < 0)
    dom[mask] = pairs[mask, 1]
    dom_type[mask] = 'synthesis'

    mask = cond4 & (betas > 0)
    dom[mask] = pairs[mask, 0]
    dom_type[mask] = 'synthesis'

    mask = cond4 & (betas < 0)
    dom[mask] = pairs[mask, 1]
    dom_type[mask] = 'synthesis'

    mask = cond5 & (gammas < 0)
    dom[mask] = pairs[mask, 0]
    dom_type[mask] = 'synthesis'

    mask = cond5 & (gammas > 0)
    dom[mask] = pairs[mask, 1]
    dom_type[mask] = 'synthesis'

    # Beta/gamma individually if none of the above
    mask = cond6 & ~cond1 & ~cond2 & ~sig_b
    dom[mask] = pairs[mask, 0]
    dom_type[mask] = 'splicing'

    mask = cond7 & ~cond1 & ~cond2 & ~sig_b & ~cond6
    dom[mask] = pairs[mask, 1]
    dom_type[mask] = 'degradation'

    mask = cond8 & ~cond1 & ~cond2 & ~sig_b & ~cond6 & ~cond7
    dom[mask] = pairs[mask, 1]
    dom_type[mask] = 'splicing'

    mask = cond9 & ~cond1 & ~cond2 & ~sig_b & ~cond6 & ~cond7 & ~cond8
    dom[mask] = pairs[mask, 0]
    dom_type[mask] = 'degradation'

    # Approximate the log fold change in k using the beta and gamma log fold changes
    ks = -0.5 * (betas.values + gammas.values)

    markers_df = pd.DataFrame(data={'gene': names.values,
                              'synthesis': bs.values,
                                'splicing': -betas.values,
                                'degradation': -gammas.values,
                                'frequency': ks,
                                'DE': highFCs,
                                'DE-reg': noSpliceFCs,
                                'marker': dom,
                                'marker_type': dom_type,
                                })

    markers_df['marker'] = pd.Categorical(markers_df['marker'], categories=np.unique(pairs), ordered=False)
    markers_df['marker_type'] = pd.Categorical(markers_df['marker_type'],
                                   categories=['None', 'synthesis', 'frequency', 'splicing', 'degradation'],
                                   ordered=False)

    return markers_df

[docs] def prepare_files_for_inference(loom_directory: str, dataset_names: list[str], transcriptome_filepath: str ) -> Tuple[str, list[str]]: loom_filepaths = [f'{loom_directory}/{dname}.loom' for dname in dataset_names] adata = ad.read_loom(loom_filepaths[0]) genes_to_use = list(adata.var['gene_name'].values) n_genes = len(genes_to_use) return monod.preprocess.construct_batch(dataset_filepaths=loom_filepaths, transcriptome_filepath=transcriptome_filepath, dataset_names=dataset_names, attribute_names=[('unspliced','spliced'), 'gene_name', 'barcode'], meta='', creator='SenID', datestring='', exp_filter_threshold=None, n_genes=n_genes, genes_to_fit=genes_to_use)
[docs] def run_monod_inference(loom_directory: str, dataset_names: list[str], transcriptome_filepath: str, dir_string: str, dataset_strings: list[str], phys_lb: list[float], phys_ub: list[float], samp_lb: list[float], samp_ub: list[float], gridsize: list[int], max_iterations: int = 15, n_restarts: int = 1, n_jobs: int = 1) -> None: if len(phys_lb) != len(phys_ub) or len(samp_lb) != len(samp_ub): raise ValueError("Lower and upper bounds must have the same length.") if len(phys_lb) != 3: raise ValueError("Physical parameter bounds must have length 3.") if len(phys_ub) != 3: raise ValueError("Physical parameter bounds must have length 3.") if len(samp_lb) != 2: raise ValueError("Sampling parameter bounds must have length 2.") if len(samp_ub) != 2: raise ValueError("Sampling parameter bounds must have length 2.") if len(gridsize) != 2: raise ValueError("Grid size must have length 2.") n_datasets = len(dataset_names) loom_filepaths = [f'{loom_directory}/{dname}.loom' for dname in dataset_names] result_strings = [] for i in range(n_datasets): fitmodel = monod.cme_toolbox.CMEModel('Bursty','Poisson') # Bursty biological model, Poisson sampling model inference_parameters = monod.inference.InferenceParameters(phys_lb, phys_ub, samp_lb, samp_ub, gridsize, dataset_strings[i], fitmodel, use_lengths = True, gradient_params = {'max_iterations':max_iterations,'init_pattern':'moments','num_restarts':n_restarts}) search_data = monod.extract_data.extract_data(loom_filepaths[i], transcriptome_filepath, dataset_names[i], dataset_strings[i], dir_string, dataset_attr_names=[('unspliced','spliced'), 'gene_name', 'barcode']) full_result_string = inference_parameters.fit_all_grid_points(n_jobs, search_data) result_strings.append(full_result_string)
[docs] def process_monod_fits(sr_arr: list[inference.SearchResults], sd_arr: list[extract_data.SearchData], dir_string: str, plot_results: bool = True, n_jobs: int = 1) -> None: for j, sr in enumerate(sr_arr): sd = sd_arr[j] if plot_results: fig1,ax1 = plt.subplots(1,1) sr.find_sampling_optimum() sr.plot_landscape(ax1) fig1,ax1 = plt.subplots(1,1) sr.plot_KL(ax1) sr.plot_gene_distributions(sd,marg='joint') _=sr.chisquare_testing(sd) sr.resample_opt_viz() sr.resample_opt_mc_viz() sr.chisq_best_param_correction(sd,viz=True) sr.compute_sigma(sd,num_cores=n_jobs) #colab has a hard time with multiprocessing if plot_results: sr.plot_param_L_dep(plot_errorbars=True,plot_fit=True) sr.plot_param_marg() monod.analysis.make_batch_analysis_dir([sr],dir_string) sr.update_on_disk()
[docs] def get_monod_de_genes(sr1: inference.SearchResults, sr2: inference.SearchResults, sd1: extract_data.SearchData, sd2: extract_data.SearchData, dname1: str, dname2: str, gene_names: list[str], gf_rej: bool = False, param_lfc: float = 2.0, mean_lfc: float = 1.0, pval_thr: float = 0.05, outlier_de: bool = True, single_nuc: bool = False, correct_off: bool = False) -> pd.DataFrame: param_lfcs = _get_log_fold_changes(sr1=sr1, sr2=sr2, sd1=sd1, sd2=sd2, dname1=dname1, dname2=dname2, gene_names=gene_names, gf_rej=gf_rej, param_lfc=param_lfc, mean_lfc=mean_lfc, pval_thr=pval_thr, outlier_de=outlier_de, single_nuc=single_nuc, correct_off=correct_off) de_results = _get_DE_results(param_lfcs) return de_results