Create the dataloader
In [1]:
! lamin load scdataloader
💡 found cached instance metadata: /home/ml4ig1/.lamin/instance--jkobject--scdataloader.env
💡 loaded instance: jkobject/scdataloader
In [2]:
%load_ext autoreload
%autoreload 2

import tqdm
from scdataloader import DataModule
💡 lamindb instance: jkobject/scdataloader
In [ ]:
datamodule = DataModule(
    collection_name="preprocessed dataset",
    organisms=["NCBITaxon:9606"],  # organism(s) we will work on
    how="most expr",  # collator strategy: keep only the most expressed genes
    max_len=1000,  # keep only the 1000 most expressed genes per cell
    batch_size=64,
    num_workers=1,
    validation_split=0.1,
    test_split=0,
)
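For intuition, the how="most expr" strategy amounts to ranking genes per cell and keeping the top max_len of them. A simplified NumPy sketch of the idea (not scdataloader's actual collator code):

import numpy as np

def most_expressed(counts: np.ndarray, max_len: int = 1000):
    # counts: (n_cells, n_genes) expression matrix for one minibatch
    # for each cell, keep the indices of its max_len highest-expression genes
    idx = np.argsort(-counts, axis=1)[:, :max_len]
    vals = np.take_along_axis(counts, idx, axis=1)
    return idx, vals  # per-cell gene indices and their expression values

# demo on random counts: 4 cells, 20,000 genes, keep the top 5 per cell
idx, vals = most_expressed(np.random.poisson(0.5, (4, 20000)).astype(float), max_len=5)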
Or it can be a much more complex dataloader too!
In [3]:
hierarchical_labels = [
    "cell_type_ontology_term_id",
    # "tissue_ontology_term_id",
    "disease_ontology_term_id",
    # "development_stage_ontology_term_id",
    "assay_ontology_term_id",
    "self_reported_ethnicity_ontology_term_id",
]
labels_to_pred = hierarchical_labels + [
    "sex_ontology_term_id",
    "organism_ontology_term_id",
]
all_labels = labels_to_pred + [
    # "dataset_id",
    # "cell_culture",
    "heat_diff",
    "total_counts",
    "nnz",
    "dpt_group",
]
name = "preprocessed dataset"
Data loader
To create the dataloader we need a lamindb collection. Here we take the one created in the previous notebook, but it could be any other collection, such as Lamin's cellxgene collection.
Example:
dataset = ln.Collection.using("laminlabs/cellxgene").one()
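A sketch of fetching such a collection by name from the public instance (the collection name below is hypothetical, and the exact query field may vary across lamindb versions):

import lamindb as ln

# query the public laminlabs/cellxgene instance for a collection
# ("cellxgene-census" is a hypothetical name used for illustration)
collection = (
    ln.Collection.using("laminlabs/cellxgene")
    .filter(name="cellxgene-census")
    .first()
)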
In [5]:
datamodule = DataModule(
    collection_name="preprocessed dataset",
    all_labels=all_labels,  # all the labels to query in the obs field
    hierarchical_labels=hierarchical_labels,  # labels that can benefit from ontological hierarchies
    organisms=["NCBITaxon:9606"],  # organism(s) we will work on
    how="most expr",  # collator strategy: keep only the most expressed genes
    max_len=1000,  # keep only the 1000 most expressed genes per cell
    add_zero_genes=100,  # also add 100 zero-expressed genes per cell
    label_to_weight=labels_to_pred,  # labels used for weighted random sampling
    label_to_pred=labels_to_pred,
    batch_size=64,
    num_workers=1,
    validation_split=0.2,
    test_split=0,
)
# set up the datamodule, following Lightning's good practices (there might still be room for improvement here)
testfiles = datamodule.setup()
won't do any check but we recommend to have your dataset coming from local storage
100.0% are aligned
total dataset size is 0.917606818 Gb
---
dataset contains:
    23349 cells
    70116 genes
    10 labels
    1 organisms
dataset contains 40 classes to predict
downloading gene names from biomart
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'start_position', 'chromosome_name']
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'start_position', 'chromosome_name']
reduced the size to 0.6722574020195106
In [7]:
for i in tqdm.tqdm(datamodule.train_dataloader()):
    # print the first batch and stop (use `pass` instead to iterate over the full epoch)
    print(i)
    break
0%| | 0/292 [00:00<?, ?it/s]
{'x': tensor([[ 78., 6., 6., ..., 0., 0., 0.], [141., 75., 58., ..., 0., 0., 0.], [309., 50., 31., ..., 0., 0., 0.], ..., [157., 108., 79., ..., 0., 0., 1.], [303., 123., 70., ..., 0., 0., 0.], [136., 29., 22., ..., 0., 0., 0.]]),
 'genes': tensor([[41514, 725, 9560, ..., 23989, 20098, 39181], [41514, 15694, 9164, ..., 47038, 10040, 54239], [41514, 16072, 12461, ..., 59205, 16411, 67531], ..., [41514, 1583, 8960, ..., 62974, 57751, 14310], [41514, 13107, 9164, ..., 20352, 32101, 9779], [41514, 15694, 409, ..., 50807, 36053, 38710]], dtype=torch.int32),
 'class': tensor([[ 2, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [12, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 0, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [12, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [10, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [10, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [ 0, 0, 0, 0, 0, 0], [12, 0, 0, 0, 0, 0], [ 1, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 0, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [10, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0]], dtype=torch.int32),
 'tp': tensor([1.2580e-03, 4.4219e-02, 3.0407e-04, 1.3363e-02, 5.3897e-05, 7.4358e-02, 1.6140e-04, 2.5554e-02, 5.0249e-02, 1.0121e-02, 1.9645e-02, 3.8997e-02, 2.7244e-04, 3.5064e-04, 1.6611e-03, 2.9493e-04, 6.1022e-03, 2.9618e-04, 2.9830e-04, 1.0688e-02, 9.9674e-02, 8.6086e-02, 1.4521e-02, 4.0110e-02, 1.9823e-02, 9.0700e-03, 3.5943e-02, 4.2530e-03, 4.3240e-02, 2.6298e-03, 3.0275e-04, 2.4445e-02, 7.9859e-03, 2.3292e-04, 2.0356e-02, 1.8703e-02, 3.1378e-04, 6.5560e-02, 1.5749e-01, 9.5593e-02, 1.0728e-01, 1.3018e-02, 3.5483e-02, 1.1571e-02, 3.3617e-02, 1.3363e-02, 3.1799e-02, 3.3795e-02, 1.1277e-01, 2.0618e-04, 1.4773e-04, 2.7142e-04, 1.7224e-01, 1.7291e-04, 1.5910e-04, 4.7466e-03, 1.1477e-04, 7.6637e-02, 6.4210e-02, 3.8356e-03, 9.0700e-03, 1.3018e-02, 3.1949e-02, 2.9733e-04]),
 'depth': tensor([ 1149., 6478., 4444., 3980., 6850., 2841., 5844., 2151., 5164., 5571., 3609., 3607., 4621., 3708., 4482., 2663., 23807., 3917., 4050., 2161., 6427., 2605., 2266., 2034., 9118., 2563., 1504., 1601., 2837., 1645., 4130., 8535., 15514., 2105., 3051., 1500., 3049., 2328., 8889., 2762., 5223., 6030., 1138., 1702., 4462., 3980., 7410., 7727., 4714., 2563., 1984., 5015., 9833., 1977., 10714., 1554., 3896., 6008., 2098., 1203., 2563., 6030., 7775., 2645.])}
0%| | 0/292 [00:03<?, ?it/s]
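The keys of the batch dict can be read off the printed output above; a minimal sketch of unpacking one batch (the shape comments are assumptions based on the settings used earlier):

batch = next(iter(datamodule.train_dataloader()))
expr = batch["x"]          # expression values of the selected genes (batch_size x max_len [+ zero genes])
gene_ids = batch["genes"]  # integer indices of those genes, same shape as expr
labels = batch["class"]    # one encoded class per label in label_to_pred
tp, depth = batch["tp"], batch["depth"]  # per-cell scalar covariates
print(expr.shape, gene_ids.shape, labels.shape)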
In [ ]:
# with PyTorch Lightning, the datamodule plugs directly into a Trainer, e.g.:
# trainer = Trainer(...)
# trainer.fit(model, datamodule=datamodule)
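For concreteness, a minimal sketch of wiring the datamodule into a Lightning run; MyModel is a hypothetical placeholder with a dummy objective, only there to exercise the loop:

import lightning as L
import torch

class MyModel(L.LightningModule):  # hypothetical placeholder model
    def __init__(self, vocab_size=70_116, dim=64):  # vocab_size must cover the gene index space
        super().__init__()
        self.emb = torch.nn.EmbeddingBag(vocab_size, dim)
        self.head = torch.nn.Linear(dim, 1)

    def training_step(self, batch, batch_idx):
        h = self.emb(batch["genes"].long())  # pool the embeddings of the selected genes
        return self.head(h).pow(2).mean()    # dummy loss, just to exercise the loop

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

trainer = L.Trainer(max_epochs=1, limit_train_batches=10)
trainer.fit(MyModel(), datamodule=datamodule)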
In [ ]:
# (WIP) build a set of different collators that can be used to preprocess the minibatches before feeding them to the model
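As a sketch of what such a custom collator could look like (the function below illustrates the idea only; scdataloader's Collator interface may differ):

import torch

def log1p_collate(samples):
    # hypothetical collator: stack per-cell samples and log-transform the counts
    xs = torch.stack([torch.as_tensor(s["x"], dtype=torch.float32) for s in samples])
    genes = torch.stack([torch.as_tensor(s["genes"]) for s in samples])
    return {"x": torch.log1p(xs), "genes": genes}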