30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def __init__(
    self,
    filter_gene_by_counts: Union[int, bool] = False,
    filter_cell_by_counts: Union[int, bool] = False,
    normalize_sum: float = 1e4,
    n_hvg_for_postp: int = 0,
    use_layer: Optional[str] = None,
    is_symbol: bool = False,
    hvg_flavor: str = "seurat_v3",
    binning: Optional[int] = None,
    result_binned_key: str = "X_binned",
    length_normalize: bool = False,
    force_preprocess: bool = False,
    min_dataset_size: int = 100,
    min_valid_genes_id: int = 10_000,
    min_nnz_genes: int = 200,
    maxdropamount: int = 50,
    madoutlier: int = 5,
    pct_mt_outlier: int = 8,
    batch_keys: list[str] = [
        "assay_ontology_term_id",
        "self_reported_ethnicity_ontology_term_id",
        "sex_ontology_term_id",
        "donor_id",
        "suspension_type",
    ],
    skip_validate: bool = False,
    additional_preprocess: Optional[Callable[[AnnData], AnnData]] = None,
    additional_postprocess: Optional[Callable[[AnnData], AnnData]] = None,
    do_postp: bool = True,
    organisms: list[str] = ["NCBITaxon:9606", "NCBITaxon:10090"],
    use_raw: bool = True,
    keepdata: bool = False,
) -> None:
    """
    Initializes the preprocessor and configures the workflow steps.

    This constructor only records the configuration on the instance; no data is
    processed here.

    Your dataset should contain at least the following obs:
    - `organism_ontology_term_id` with the ontology id of the organism of your anndata
    - gene names in the `var.index` field of your anndata that map to the ensembl_gene nomenclature
      or the hugo gene symbols nomenclature (if the latter, set `is_symbol` to True)

    Args:
        filter_gene_by_counts (int or bool, optional): Determines whether to filter genes by counts.
            If int, filters genes with counts. Defaults to False.
        filter_cell_by_counts (int or bool, optional): Determines whether to filter cells by counts.
            If int, filters cells with counts. Defaults to False.
        normalize_sum (float, optional): The value to which the total counts of each cell are normalized.
            Defaults to 1e4.
        n_hvg_for_postp (int, optional): Number of highly variable genes to subset to for the PCA.
            Defaults to 0.
        use_layer (str, optional): Name of a layer, stored as-is on the instance.
            Presumably the AnnData layer to read the counts from - confirm against the calling code.
            Defaults to None.
        is_symbol (bool, optional): Set to True when the gene names in `var.index` follow the
            hugo gene symbols nomenclature instead of the ensembl_gene nomenclature.
            Defaults to False.
        hvg_flavor (str, optional): Specifies the flavor of highly variable genes selection.
            See :func:`scanpy.pp.highly_variable_genes` for more details. Defaults to "seurat_v3".
        binning (int, optional): Determines whether to bin the data into discrete values of number of bins provided.
            Defaults to None.
        result_binned_key (str, optional): Specifies the key of :class:`~anndata.AnnData` to store the binned data.
            Defaults to "X_binned".
        length_normalize (bool, optional): Determines whether to length normalize the data.
            Defaults to False.
        force_preprocess (bool, optional): Determines whether to bypass the check of raw counts.
            Defaults to False.
        min_dataset_size (int, optional): The minimum size required for a dataset to be kept.
            Defaults to 100.
        min_valid_genes_id (int, optional): The minimum number of valid genes to keep a dataset.
            Defaults to 10_000.
        min_nnz_genes (int, optional): The minimum number of non-zero genes to keep a cell.
            Defaults to 200.
        maxdropamount (int, optional): The maximum amount of dropped cells per dataset.
            NOTE(review): earlier documentation described this as "2 for 50% drop, 3 for 33% drop";
            confirm the unit against the code that consumes it. Defaults to 50.
        madoutlier (int, optional): The maximum absolute deviation of the outlier samples.
            Defaults to 5.
        pct_mt_outlier (int, optional): The maximum percentage of mitochondrial genes outlier.
            Defaults to 8.
        batch_keys (list[str], optional): The keys of :class:`~anndata.AnnData.obs` to use for batch information.
            This arg is used in the highly variable gene selection step.
            Defaults to the assay, self-reported ethnicity, sex, donor and suspension type columns.
        skip_validate (bool, optional): Determines whether to skip the validation step.
            Defaults to False.
        additional_preprocess (callable, optional): Function taking an AnnData and returning an AnnData.
            Presumably applied as an extra preprocessing step - confirm against the calling code.
            Defaults to None.
        additional_postprocess (callable, optional): Function taking an AnnData and returning an AnnData.
            Presumably applied as an extra postprocessing step - confirm against the calling code.
            Defaults to None.
        do_postp (bool, optional): Presumably toggles the postprocessing steps - confirm against
            the calling code. Defaults to True.
        organisms (list[str], optional): Organism ontology term ids, stored as-is on the instance.
            Presumably the organisms accepted by the preprocessor - confirm against the calling code.
            Defaults to ["NCBITaxon:9606", "NCBITaxon:10090"].
        use_raw (bool, optional): Presumably whether to use the `raw` data of the AnnData when
            available - confirm against the calling code. Defaults to True.
        keepdata (bool, optional): Determines whether to keep the data in the AnnData object.
            Defaults to False.
    """
    self.filter_gene_by_counts = filter_gene_by_counts
    self.filter_cell_by_counts = filter_cell_by_counts
    self.normalize_sum = normalize_sum
    self.hvg_flavor = hvg_flavor
    self.binning = binning
    # The list defaults in the signature are built once, when the function is defined.
    # Storing them directly would make every instance share one list object, so a
    # mutation through one instance would leak into all others. Copy list arguments
    # so each instance owns its own list; non-list values are stored untouched.
    self.organisms = list(organisms) if isinstance(organisms, list) else organisms
    self.result_binned_key = result_binned_key
    self.additional_preprocess = additional_preprocess
    self.additional_postprocess = additional_postprocess
    self.force_preprocess = force_preprocess
    self.min_dataset_size = min_dataset_size
    self.min_valid_genes_id = min_valid_genes_id
    self.min_nnz_genes = min_nnz_genes
    self.maxdropamount = maxdropamount
    self.madoutlier = madoutlier
    self.n_hvg_for_postp = n_hvg_for_postp
    self.pct_mt_outlier = pct_mt_outlier
    # Same ownership rule as `organisms` above.
    self.batch_keys = list(batch_keys) if isinstance(batch_keys, list) else batch_keys
    self.length_normalize = length_normalize
    self.skip_validate = skip_validate
    self.use_layer = use_layer
    self.is_symbol = is_symbol
    self.do_postp = do_postp
    self.use_raw = use_raw
    self.keepdata = keepdata
|