Create the dataloader
In [1]:
! lamin load scdataloader
💡 found cached instance metadata: /home/ml4ig1/.lamin/instance--jkobject--scdataloader.env
💡 loaded instance: jkobject/scdataloader
In [2]:
%load_ext autoreload
%autoreload 2

import tqdm
from scdataloader import DataModule
💡 lamindb instance: jkobject/scdataloader
In [ ]:
datamodule = DataModule(
    collection_name="preprocessed dataset",
    organisms=["NCBITaxon:9606"],  # organism(s) we will work on
    how="most expr",  # collator strategy: keep only the most expressed genes
    max_len=1000,  # keep only the 1000 most expressed genes per cell
    batch_size=64,
    num_workers=1,
    validation_split=0.1,
    test_split=0,
)
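For intuition, the how="most expr" strategy amounts to ranking genes per cell and keeping the top max_len of them. A simplified NumPy sketch of the idea (not scdataloader's actual collator code):

import numpy as np

def most_expressed(counts: np.ndarray, max_len: int = 1000):
    # counts: (n_cells, n_genes) expression matrix for one minibatch
    # for each cell, keep the indices of its max_len highest-expression genes
    idx = np.argsort(-counts, axis=1)[:, :max_len]
    vals = np.take_along_axis(counts, idx, axis=1)
    return idx, vals  # per-cell gene indices and their expression values

# demo on random counts: 4 cells, 20,000 genes, keep the top 5 per cell
idx, vals = most_expressed(np.random.poisson(0.5, (4, 20000)).astype(float), max_len=5)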
Or it can be a much more complex dataloader too!
In [3]:
hierarchical_labels = [
    "cell_type_ontology_term_id",
    # "tissue_ontology_term_id",
    "disease_ontology_term_id",
    # "development_stage_ontology_term_id",
    "assay_ontology_term_id",
    "self_reported_ethnicity_ontology_term_id",
]
labels_to_pred = hierarchical_labels + [
    "sex_ontology_term_id",
    "organism_ontology_term_id",
]
all_labels = labels_to_pred + [
    # "dataset_id",
    # "cell_culture",
    "heat_diff",
    "total_counts",
    "nnz",
    "dpt_group",
]
name = "preprocessed dataset"
Data loader
To create the dataloader we need a lamindb collection. Here we take the one created in the previous notebook, but it could be any other collection, such as Lamin's cellxgene collection.
Example:
dataset = ln.Collection.using("laminlabs/cellxgene").one()
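A sketch of fetching such a collection by name from the public instance (the collection name below is hypothetical, and the exact query field may vary across lamindb versions):

import lamindb as ln

# query the public laminlabs/cellxgene instance for a collection
# ("cellxgene-census" is a hypothetical name used for illustration)
collection = (
    ln.Collection.using("laminlabs/cellxgene")
    .filter(name="cellxgene-census")
    .first()
)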
In [5]:
datamodule = DataModule(
    collection_name="preprocessed dataset",
    all_labels=all_labels,  # all the labels to query in the obs field
    hierarchical_labels=hierarchical_labels,  # labels that can benefit from ontological hierarchies
    organisms=["NCBITaxon:9606"],  # organism(s) we will work on
    how="most expr",  # collator strategy: keep only the most expressed genes
    max_len=1000,  # keep only the 1000 most expressed genes per cell
    add_zero_genes=100,  # also add 100 zero-expressed genes per cell
    label_to_weight=labels_to_pred,  # labels used for weighted random sampling
    label_to_pred=labels_to_pred,
    batch_size=64,
    num_workers=1,
    validation_split=0.2,
    test_split=0,
)
# set up the datamodule, following Lightning's good practices (there might still be room for improvement here)
testfiles = datamodule.setup()
won't do any check but we recommend to have your dataset coming from local storage
100.0% are aligned
total dataset size is 0.917606818 Gb
---
dataset contains:
    23349 cells
    70116 genes
    10 labels
    1 organisms
dataset contains 40 classes to predict
downloading gene names from biomart
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'start_position', 'chromosome_name']
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'start_position', 'chromosome_name']
reduced the size to 0.6722574020195106
In [7]:
for i in tqdm.tqdm(datamodule.train_dataloader()):
    # print the first batch and stop (use `pass` instead to iterate over the full epoch)
    print(i)
    break
0%| | 0/292 [00:00<?, ?it/s]
{'x': tensor([[ 78., 6., 6., ..., 0., 0., 0.], [141., 75., 58., ..., 0., 0., 0.], [309., 50., 31., ..., 0., 0., 0.], ..., [157., 108., 79., ..., 0., 0., 1.], [303., 123., 70., ..., 0., 0., 0.], [136., 29., 22., ..., 0., 0., 0.]]),
 'genes': tensor([[41514, 725, 9560, ..., 23989, 20098, 39181], [41514, 15694, 9164, ..., 47038, 10040, 54239], [41514, 16072, 12461, ..., 59205, 16411, 67531], ..., [41514, 1583, 8960, ..., 62974, 57751, 14310], [41514, 13107, 9164, ..., 20352, 32101, 9779], [41514, 15694, 409, ..., 50807, 36053, 38710]], dtype=torch.int32),
 'class': tensor([[ 2, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [12, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 0, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [12, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [10, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [10, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [ 0, 0, 0, 0, 0, 0], [12, 0, 0, 0, 0, 0], [ 1, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 8, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0], [ 0, 0, 0, 0, 0, 0], [ 3, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [11, 0, 0, 0, 0, 0], [ 2, 0, 0, 0, 0, 0], [ 9, 0, 0, 0, 0, 0], [10, 0, 0, 0, 0, 0], [ 7, 0, 0, 0, 0, 0], [ 4, 0, 0, 0, 0, 0]], dtype=torch.int32),
 'tp': tensor([1.2580e-03, 4.4219e-02, 3.0407e-04, 1.3363e-02, 5.3897e-05, 7.4358e-02, 1.6140e-04, 2.5554e-02, 5.0249e-02, 1.0121e-02, 1.9645e-02, 3.8997e-02, 2.7244e-04, 3.5064e-04, 1.6611e-03, 2.9493e-04, 6.1022e-03, 2.9618e-04, 2.9830e-04, 1.0688e-02, 9.9674e-02, 8.6086e-02, 1.4521e-02, 4.0110e-02, 1.9823e-02, 9.0700e-03, 3.5943e-02, 4.2530e-03, 4.3240e-02, 2.6298e-03, 3.0275e-04, 2.4445e-02, 7.9859e-03, 2.3292e-04, 2.0356e-02, 1.8703e-02, 3.1378e-04, 6.5560e-02, 1.5749e-01, 9.5593e-02, 1.0728e-01, 1.3018e-02, 3.5483e-02, 1.1571e-02, 3.3617e-02, 1.3363e-02, 3.1799e-02, 3.3795e-02, 1.1277e-01, 2.0618e-04, 1.4773e-04, 2.7142e-04, 1.7224e-01, 1.7291e-04, 1.5910e-04, 4.7466e-03, 1.1477e-04, 7.6637e-02, 6.4210e-02, 3.8356e-03, 9.0700e-03, 1.3018e-02, 3.1949e-02, 2.9733e-04]),
 'depth': tensor([ 1149., 6478., 4444., 3980., 6850., 2841., 5844., 2151., 5164., 5571., 3609., 3607., 4621., 3708., 4482., 2663., 23807., 3917., 4050., 2161., 6427., 2605., 2266., 2034., 9118., 2563., 1504., 1601., 2837., 1645., 4130., 8535., 15514., 2105., 3051., 1500., 3049., 2328., 8889., 2762., 5223., 6030., 1138., 1702., 4462., 3980., 7410., 7727., 4714., 2563., 1984., 5015., 9833., 1977., 10714., 1554., 3896., 6008., 2098., 1203., 2563., 6030., 7775., 2645.])}
0%| | 0/292 [00:03<?, ?it/s]
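The keys of the batch dict can be read off the printed output above; a minimal sketch of unpacking one batch (the shape comments are assumptions based on the settings used earlier):

batch = next(iter(datamodule.train_dataloader()))
expr = batch["x"]          # expression values of the selected genes (batch_size x max_len [+ zero genes])
gene_ids = batch["genes"]  # integer indices of those genes, same shape as expr
labels = batch["class"]    # one encoded class per label in label_to_pred
tp, depth = batch["tp"], batch["depth"]  # per-cell scalar covariates
print(expr.shape, gene_ids.shape, labels.shape)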
In [ ]:
# with PyTorch Lightning, the datamodule plugs directly into a Trainer, e.g.:
# trainer = Trainer(...)
# trainer.fit(model, datamodule=datamodule)
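For concreteness, a minimal sketch of wiring the datamodule into a Lightning run; MyModel is a hypothetical placeholder with a dummy objective, only there to exercise the loop:

import lightning as L
import torch

class MyModel(L.LightningModule):  # hypothetical placeholder model
    def __init__(self, vocab_size=70_116, dim=64):  # vocab_size must cover the gene index space
        super().__init__()
        self.emb = torch.nn.EmbeddingBag(vocab_size, dim)
        self.head = torch.nn.Linear(dim, 1)

    def training_step(self, batch, batch_idx):
        h = self.emb(batch["genes"].long())  # pool the embeddings of the selected genes
        return self.head(h).pow(2).mean()    # dummy loss, just to exercise the loop

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

trainer = L.Trainer(max_epochs=1, limit_train_batches=10)
trainer.fit(MyModel(), datamodule=datamodule)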
In [ ]:
# (WIP) build a set of different collators that can be used to preprocess the minibatches before feeding them to the model
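As a sketch of what such a custom collator could look like (the function below illustrates the idea only; scdataloader's Collator interface may differ):

import torch

def log1p_collate(samples):
    # hypothetical collator: stack per-cell samples and log-transform the counts
    xs = torch.stack([torch.as_tensor(s["x"], dtype=torch.float32) for s in samples])
    genes = torch.stack([torch.as_tensor(s["genes"]) for s in samples])
    return {"x": torch.log1p(xs), "genes": genes}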