nb_estimates_states_singularity nb_estimates_states_singularity.py iterations get referencecell2loca

visium_heart/st_snRNAseq/04_deconvolution/nb_estimates_states_singularity.py at 5b30c7e497e06688a8448afd8d069d2fa70ebcd2 · saezlab/visium_heart (github.com)

visium_heart/st_snRNAseq/04_deconvolution/nb_to_csvs.py at 5b30c7e497e06688a8448afd8d069d2fa70ebcd2 · saezlab/visium_heart (github.com)

Notes to help with understanding the scripts

# Notebook check: number of cells per 'Subset' category in the reference data.
adata_ref.obs['Subset'].value_counts()

# NOTE: this binds a second name to the same object (no copy), so the
# in-place filters below also change adata_ref.
sc_data = adata_ref

# remove cells and genes with 0 counts everywhere
sc.pp.filter_cells(sc_data, min_genes=1)
sc.pp.filter_genes(sc_data, min_cells=1)

# calculate the mean of each gene across non-zero cells
# The matrix is densified once and reused for both statistics instead of
# calling .toarray() twice on the full matrix.
dense_counts = sc_data.X.toarray()
sc_data.var['n_cells'] = (dense_counts > 0).sum(0)
sc_data.var['nonz_mean'] = dense_counts.sum(0) / sc_data.var['n_cells']
sc_data.var['SYMBOL'] = sc_data.var.index.tolist()
del dense_counts  # release the dense copy as soon as the statistics are stored



# All three thresholds are compared against log10-transformed statistics.
nonz_mean_cutoff = 0.05
# log10 of 0.05% and of 3% of the number of cells, respectively
cell_count_cutoff = np.log10(sc_data.shape[0] * 0.0005)
cell_count_cutoff2 = np.log10(sc_data.shape[0] * 0.03)

log_nonz_mean = np.log10(sc_data.var['nonz_mean'].to_numpy())
log_n_cells = np.log10(sc_data.var['n_cells'].to_numpy())

# Keep a gene when it is detected in more than the lower cell-count cutoff AND
# it either exceeds the higher cell-count cutoff or has a high enough mean in
# the cells where it is detected.
# (The original post had the HTML-escaped "& amp;" here, which is not valid
# Python; the intended operator is the element-wise "&".)
gene_mask = (
    ((log_nonz_mean > nonz_mean_cutoff) | (log_n_cells > cell_count_cutoff2))
    & (log_n_cells > cell_count_cutoff)
)

# Notebook check: shape of the data after gene selection.
sc_data[:, gene_mask].shape

# select genes based on mean expression in non-zero cells
sc_data = sc_data[:, gene_mask]

# Store the filtered object as .raw as well.
sc_data.raw = sc_data

# Notebook checks: the cluster labels as a 2-D (DataFrame) and 1-D (Series) array.
sc_data.obs[["new.cluster.idents"]].values
sc_data.obs["new.cluster.idents"].values

# Expose the cluster labels under the 'cell_type' column used below.
sc_data.obs['cell_type'] = sc_data.obs["new.cluster.idents"].values

# Fraction of each cell type that goes into one subsample.
perc_filter = 0.4

# Sorted unique cell types together with the number of cells of each type.
cell_types, type_counts = np.unique(sc_data.obs[["cell_type"]].values, return_counts=True)
cell_types_n = dict(zip(cell_types, type_counts))

# Number of cells to draw per cell type (fraction of the total, rounded down).
cell_types_sample = {}
for label, n_cells_of_type in cell_types_n.items():
    cell_types_sample[label] = int(np.floor(n_cells_of_type * perc_filter))


from numpy.random import default_rng

# Five subsampling iterations, numbered 1..5; the number doubles as the seed.
iters = list(range(1,6))

for i in iters:

    # One seeded generator per iteration. All draws below must go through
    # `rng`: the original called np.random.choice, which ignores this seed
    # and made the subsamples non-reproducible.
    rng = default_rng(seed=i)

    barcodes = []

    for ct, nsample in cell_types_sample.items():
        ct_barcodes = sc_data.obs[sc_data.obs['cell_type'] == ct].index
        # Draw nsample distinct barcodes of this cell type.
        sample_barcodes = rng.choice(ct_barcodes.to_numpy(), size=nsample, replace=False)
        barcodes.extend(sample_barcodes)

    sampled_ct_data = sc_data[barcodes, :]
    print(sampled_ct_data.obs.describe())

c2l_nb_states_gpu_singularity.sh
#!/bin/bash

# Slurm batch job: runs nb_estimates_states_singularity.py inside the
# cell2location Singularity image with GPU support.

#SBATCH --job-name=nbsingularity
#SBATCH -t 1000:00
# The address was redacted by the hosting site ("[email protected]"), leaving an
# invalid directive. Replace the placeholder with the address to notify.
#SBATCH --mail-user=your.name@example.org
#SBATCH --mail-type=END
#SBATCH --output /net/data.isilon/ag-saez/bq_rramirez/MI_deconvolution/jobs/c2l_nb_states_gpu_singularity.out

# Project directory: holds the container image, the scripts and the data.
PROJECT_DIR=/net/data.isilon/ag-saez/bq_rramirez/MI_deconvolution

module load system/singularity

# The python script uses paths relative to the project directory, so stop
# instead of running from the wrong place if the directory is unavailable.
cd "${PROJECT_DIR}/" || exit 1

# --nv makes the host NVIDIA GPU available inside the container;
# -B bind-mounts the project directory into the container.
singularity exec --nv -B \
    "${PROJECT_DIR}/" \
    "${PROJECT_DIR}/cell2location-v0.05-alpha.sif" \
    /bin/bash -c \
    "python ./scripts/nb_estimates_states_singularity.py"

The nb_estimates_states_singularity.py script is as follows

#!/usr/bin/env python
# coding: utf-8

import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc

# Theano configuration: run on the CUDA device with 32-bit floats.
# This is set before `import cell2location` so the flags are already in the
# environment when that package is imported; keep the order.
data_type = 'float32'
os.environ["THEANO_FLAGS"] = 'device=cuda,floatX=' + data_type + ',force_device=True'

import cell2location
import matplotlib as mpl
from matplotlib import rcParams
import matplotlib.pyplot as plt
import seaborn as sns

# silence scanpy that prints a lot of warnings
import warnings
warnings.filterwarnings('ignore')

# read data
# (path is relative to the working directory the script is started from)
sc_data = sc.read("./integrated_rnasamples_ann.h5ad")

# remove cells and genes with 0 counts everywhere
sc.pp.filter_cells(sc_data, min_genes=1)
sc.pp.filter_genes(sc_data, min_cells=1)

# calculate the mean of each gene across non-zero cells
# The matrix is densified once and reused for both statistics instead of
# calling .toarray() twice on the full matrix.
dense_counts = sc_data.X.toarray()
sc_data.var['n_cells'] = (dense_counts > 0).sum(0)
sc_data.var['nonz_mean'] = dense_counts.sum(0) / sc_data.var['n_cells']
sc_data.var['SYMBOL'] = sc_data.var.index.tolist()
del dense_counts  # release the dense copy as soon as the statistics are stored

# All three thresholds are compared against log10-transformed statistics.
nonz_mean_cutoff = 0.05
# log10 of 0.05% and of 3% of the number of cells, respectively
cell_count_cutoff = np.log10(sc_data.shape[0] * 0.0005)
cell_count_cutoff2 = np.log10(sc_data.shape[0] * 0.03)

log_nonz_mean = np.log10(sc_data.var['nonz_mean'].to_numpy())
log_n_cells = np.log10(sc_data.var['n_cells'].to_numpy())

# Keep a gene when it is detected in more than the lower cell-count cutoff AND
# it either exceeds the higher cell-count cutoff or has a high enough mean in
# the cells where it is detected.
# (The original post had the HTML-escaped "& amp;" here, which is not valid
# Python; the intended operator is the element-wise "&".)
gene_mask = (
    ((log_nonz_mean > nonz_mean_cutoff) | (log_n_cells > cell_count_cutoff2))
    & (log_n_cells > cell_count_cutoff)
)

# A bare `.shape` expression does nothing in a script, so report the shape
# after gene selection in the job log instead.
print(sc_data[:, gene_mask].shape)

# select genes based on mean expression in non-zero cells
sc_data = sc_data[:, gene_mask]

# Store the filtered object as .raw; the regression below is run with
# 'use_raw': True.
sc_data.raw = sc_data

# Subsampling set-up: how many cells of each type go into one subsample.
# NOTE(review): this expects a 'cell_type' column to already be present in
# sc_data.obs of the loaded file -- confirm against the input data.

# Fraction of each cell type that goes into one subsample.
perc_filter = 0.4

# Sorted unique cell types together with the number of cells of each type.
cell_types, type_counts = np.unique(sc_data.obs[["cell_type"]].values, return_counts=True)
cell_types_n = dict(zip(cell_types, type_counts))

# Number of cells to draw per cell type (fraction of the total, rounded down).
cell_types_sample = {}
for label, n_cells_of_type in cell_types_n.items():
    cell_types_sample[label] = int(np.floor(n_cells_of_type * perc_filter))

from numpy.random import default_rng

# Imported once here rather than on every loop iteration.
from cell2location import run_regression

# Five subsampling iterations, numbered 1..5; the number doubles as the seed
# and as the suffix of the exported run name.
iters = list(range(1,6))

for i in iters:

    # One seeded generator per iteration. All draws below must go through
    # `rng`: the original called np.random.choice, which ignores this seed
    # and made the subsamples non-reproducible.
    rng = default_rng(seed=i)

    barcodes = []

    for ct, nsample in cell_types_sample.items():
        ct_barcodes = sc_data.obs[sc_data.obs['cell_type'] == ct].index
        # Draw nsample distinct barcodes of this cell type.
        sample_barcodes = rng.choice(ct_barcodes.to_numpy(), size=nsample, replace=False)
        barcodes.extend(sample_barcodes)

    sampled_ct_data = sc_data[barcodes, :]

    # Fit the regression model on this subsample; results are exported under
    # ./nb_estimates/ with a run-name suffix that includes the iteration number.
    r, adata_sc_data = run_regression(sampled_ct_data, # input data object

                       verbose=True, return_all=True,

                       train_args={
                        'covariate_col_names': ['cell_type'], # column listing cell type annotation
                        'sample_name_col': 'orig_ident', # column listing sample ID for each cell
                        'tech_name_col': 'batch',
                        'stratify_cv': 'cell_type', # stratify cross-validation by cell type annotation
                        'n_epochs': 100,
                        'minibatch_size': 1024,
                        'learning_rate': 0.01,
                        'use_cuda': True, # use GPU?
                        'train_proportion': 0.9, # proportion of cells in the training set (for cross-validation)
                        'l2_weight': True, # uses defaults for the model

                        'readable_var_name_col': 'SYMBOL', 'use_raw': True},

                       model_kwargs={}, # keep defaults
                       posterior_args={}, # keep defaults

                       export_args={'path': './nb_estimates/', # where to save results
                                    'save_model': True, # save pytorch model?
                                    'run_name_suffix': '_celltypes' + str(i)})

    # Fitted model object returned under the 'mod' key.
    reg_mod = r['mod']

The knowledge points of the article match the official knowledge files, and you can further learn relevant knowledge. Python entry skill treeHomepageOverview 374705 people are learning the system