In [90]:
import torch
from torch.utils.data import Dataset, DataLoader
import time # To demonstrate performance implications of num_workers

In [91]:

class MyCustomDataset(Dataset):
    """Synthetic dataset whose i-th sample is the value ``i`` repeated ``feature_dim`` times.

    Labels are random binary integers (0 or 1) drawn once at construction time.
    """

    def __init__(self, num_samples=1000, feature_dim=10, transform=None):
        """Build the feature matrix and the random labels.

        Args:
            num_samples: Number of samples (rows) to generate.
            feature_dim: Number of identical features per sample (columns).
            transform: Optional callable applied to a sample's features in
                ``__getitem__``.
        """
        self.num_samples = num_samples
        self.transform = transform

        # 0, 1, ..., num_samples - 1 as floating-point values.
        sample_ids = torch.linspace(0, num_samples - 1, steps=num_samples)
        # Tile the ids feature_dim times, lay them out as (feature_dim, num_samples),
        # then transpose so that row i holds the value i in every column.
        tiled_ids = sample_ids.repeat(feature_dim).view(feature_dim, num_samples)
        self.data = tiled_ids.t()

        # Synthetic binary labels (0 or 1), one per sample.
        self.labels = torch.randint(0, 2, (num_samples,))

    def __len__(self):
        """Return the number of samples given at construction."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return ``(features, label)`` for ``idx``, transforming the features if a transform is set."""
        features = self.data[idx]
        target = self.labels[idx]

        # No (truthy) transform configured: hand back the raw features.
        if not self.transform:
            return features, target

        return self.transform(features), target


In [92]:
# 100,000 synthetic samples with 4 features each.
dataset = MyCustomDataset(
    num_samples=100_000,
    feature_dim=4,
)

In [93]:
# Sample 0: every feature equals the sample index, so all features are 0.
dataset[0]

(tensor([0., 0., 0., 0.]), tensor(1))

In [94]:
# Sample 1: every feature equals the sample index, so all features are 1.
dataset[1]

(tensor([1., 1., 1., 1.]), tensor(1))

In [95]:
# An arbitrary index: the features are the index (131) repeated feature_dim times.
dataset[131]

(tensor([131., 131., 131., 131.]), tensor(0))

In [96]:
# Loader settings to experiment with.
batch_size = 16                         # Try changing this
shuffle = True                          # Try setting to False
num_workers = 0                         # Try 0, then 2 or 4 if you have multiple cores, to see effect
pin_memory = torch.cuda.is_available()  # Only pin memory if a GPU is available

print(f"DataLoader settings: batch_size={batch_size}, shuffle={shuffle}, num_workers={num_workers}, pin_memory={pin_memory}")

# Collect the keyword arguments in one place, then build the loader from them.
loader_options = {
    "batch_size": batch_size,
    "shuffle": shuffle,
    "num_workers": num_workers,
    "pin_memory": pin_memory,
    "drop_last": False,  # Usually False for validation/test, can be True for training
}
data_loader = DataLoader(dataset, **loader_options)

DataLoader settings: batch_size=16, shuffle=True, num_workers=0, pin_memory=False


In [97]:
# Take the first batch from a fresh iterator over the loader and display it.
# The iterator is a throwaway temporary, so nothing keeps it alive afterwards.
batch = next(iter(data_loader))
batch

[tensor([[33116., 33116., 33116., 33116.],
         [27729., 27729., 27729., 27729.],
         [ 8221.,  8221.,  8221.,  8221.],
         [46026., 46026., 46026., 46026.],
         [ 9590.,  9590.,  9590.,  9590.],
         [ 5832.,  5832.,  5832.,  5832.],
         [21644., 21644., 21644., 21644.],
         [ 8384.,  8384.,  8384.,  8384.],
         [84184., 84184., 84184., 84184.],
         [68663., 68663., 68663., 68663.],
         [56310., 56310., 56310., 56310.],
         [ 5421.,  5421.,  5421.,  5421.],
         [85879., 85879., 85879., 85879.],
         [65818., 65818., 65818., 65818.],
         [ 3886.,  3886.,  3886.,  3886.],
         [99856., 99856., 99856., 99856.]]),
 tensor([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0])]

The DataLoader takes care of batching and shuffling and, when `num_workers > 0`, loads the data in parallel using multiprocessing workers.


Before training the model, check how fast the data is loaded by iterating through the DataLoader and measuring the time taken for one full pass (one epoch).



In [98]:
# Time one full pass over the loader.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock and can jump if the
# system clock is adjusted.
start_time = time.perf_counter()

for batch_idx, (batch_data, batch_labels) in enumerate(data_loader):
    # Here, you would typically pass batch_data to your model, calculate loss, etc.
    # model(batch_data)
    # loss = criterion(output, batch_labels)
    # ...
    continue  # just do nothing for now: only the loading cost is measured

# Read the clock once, immediately after the loop, so that the reported value
# covers the iteration only and not the formatting/printing below.
elapsed = time.perf_counter() - start_time

# You can try timing the loop with different num_workers to see the impact.
# For this tiny synthetic dataset, the difference might be negligible or even
# negative due to overhead. For datasets involving disk I/O, num_workers is key.
if num_workers > 0:
    print(f"Time taken for epoch with {num_workers} workers: {elapsed:.4f} seconds")
else:
    print(f"Time taken for epoch with main process loading: {elapsed:.4f} seconds")



Time taken for epoch with main process loading: 0.4101 seconds


In [99]:
%%time

# Drain the loader once; the cell magic at the top of this cell reports the timing.
for batch_idx, (batch_data, batch_labels) in enumerate(data_loader):
    pass  # no per-batch work for now


CPU times: user 1.23 s, sys: 208 ms, total: 1.44 s
Wall time: 455 ms


Examples with comments.

In [2]:
# 1. Define a Custom Dataset
# ---------------------------
# Your custom dataset must inherit from torch.utils.data.Dataset
# and implement __len__ and __getitem__.

class MyCustomDataset(Dataset):
    """
    Minimal map-style dataset backed by randomly generated tensors.

    It stands in for a real dataset: in practice the constructor would record
    where the actual data lives (image file paths, lines of a text file, rows
    of a CSV, ...) instead of generating values.
    """
    def __init__(self, num_samples=1000, feature_dim=10, transform=None):
        """
        Generate the synthetic features and labels held by this dataset.

        Args:
            num_samples (int): How many samples the dataset contains.
            feature_dim (int): Number of features in each sample.
            transform (callable, optional): If given, applied to a sample's
                features each time the sample is fetched (e.g., data
                augmentation for images).
        """
        print(f"Initializing MyCustomDataset with {num_samples} samples.")
        self.num_samples = num_samples
        self.transform = transform

        # Random stand-ins for real data. A real dataset might keep a list of
        # file paths in self.data and read self.labels from a manifest file.
        feature_shape = (num_samples, feature_dim)
        self.data = torch.randn(feature_shape)  # Synthetic features
        self.labels = torch.randint(low=0, high=2, size=(num_samples,))  # Binary labels (0 or 1)

    def __len__(self):
        """
        Report how many samples the dataset holds.
        The DataLoader relies on this to know how many items it can fetch.
        """
        return self.num_samples

    def __getitem__(self, idx):
        """
        Fetch a single (sample, label) pair; the DataLoader calls this once per item.

        Args:
            idx (int): Position of the requested item.

        Returns:
            tuple: (sample, label), where sample has had `transform` applied
            when a transform is set.
        """
        features = self.data[idx]
        target = self.labels[idx]

        # In more complex datasets, `idx` could instead select an image path to
        # load, a line of a large text file to read, or a database row to query.
        # Either way, exactly one processed data point and its label come back.
        if not self.transform:
            return features, target

        # A transform is configured (e.g., normalization, resizing, augmentation).
        return self.transform(features), target


In [3]:
# 2. Instantiate the Custom Dataset
# ---------------------------------
print("\n--- Creating Dataset Instance ---")
custom_dataset = MyCustomDataset(
    num_samples=100,
    feature_dim=5,
)

# Exercise __len__ ...
print(f"Length of dataset: {len(custom_dataset)}")

# ... and __getitem__, by fetching the very first sample.
sample_idx = 0
first_sample = custom_dataset[sample_idx]
first_sample_data, first_sample_label = first_sample
print(f"First sample (data at index {sample_idx}): {first_sample_data}")
print(f"First sample label (label at index {sample_idx}): {first_sample_label}")




--- Creating Dataset Instance ---
Initializing MyCustomDataset with 100 samples.
Length of dataset: 100
First sample (data at index 0): tensor([ 0.7572, -1.8577,  0.9837,  0.6581,  0.2992])
First sample label (label at index 0): 1


In [4]:
# Shapes of a single sample: a 1-D feature vector and a 0-D (scalar) label.
x, y = custom_dataset[10]
(x.shape, y.shape)

(torch.Size([5]), torch.Size([]))

In [5]:
# 3. Define the DataLoader
# ------------------------
# A DataLoader wraps a Dataset in an iterable that handles batching,
# shuffling, and parallel data loading.
print("\n--- Creating DataLoader Instance ---")

# DataLoader parameters used below:
# - dataset: the Dataset object to draw samples from.
# - batch_size (int, optional, default=1): number of samples per batch.
# - shuffle (bool, optional, default=False): when True, the data is reshuffled
#   at every epoch, so training batches differ from one epoch to the next.
# - num_workers (int, optional, default=0): number of subprocesses used for
#   loading; 0 loads in the main process. More workers can speed loading up
#   considerably by using several CPU cores, especially when __getitem__ does
#   I/O or heavy computation.
#   (BEWARE: on Windows, guard num_workers > 0 with if __name__ == '__main__':)
# - pin_memory (bool, optional, default=False): when True, tensors are copied
#   into CUDA pinned memory before being returned, which can speed up
#   CPU-to-GPU transfers. Typically enabled when training on a GPU.
# - drop_last (bool, optional, default=False): when True, the final incomplete
#   batch is dropped if the dataset size is not divisible by batch_size; when
#   False, that final batch is simply smaller.

batch_size_val = 16                         # Try changing this
shuffle_val = True                          # Try setting to False
num_workers_val = 0                         # Try 0, then 2 or 4 if you have multiple cores, to see effect
pin_memory_val = torch.cuda.is_available()  # Only pin memory if a GPU is available

print(f"DataLoader settings: batch_size={batch_size_val}, shuffle={shuffle_val}, num_workers={num_workers_val}, pin_memory={pin_memory_val}")

# Collect the keyword arguments in one place, then build the loader from them.
loader_options = {
    "batch_size": batch_size_val,
    "shuffle": shuffle_val,
    "num_workers": num_workers_val,
    "pin_memory": pin_memory_val,
    "drop_last": False,  # Usually False for validation/test, can be True for training
}
data_loader = DataLoader(custom_dataset, **loader_options)



--- Creating DataLoader Instance ---
DataLoader settings: batch_size=16, shuffle=True, num_workers=0, pin_memory=False


In [6]:
# 4. Iterate through the DataLoader (Example Usage)
# -------------------------------------------------
print("\n--- Iterating through DataLoader (1 epoch) ---")
# In a training loop, you would iterate over the data_loader for each epoch.
for epoch in range(1): # Simulating one epoch
    print(f"\nEpoch {epoch+1}")

    # Start the timer per epoch so that the figure printed below really is the
    # time of this epoch, even if the range above is raised to several epochs.
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; it is unaffected by system clock adjustments.
    start_time = time.perf_counter()

    for batch_idx, (batch_data, batch_labels) in enumerate(data_loader):
        # batch_data will be a tensor of shape (batch_size, feature_dim)
        # batch_labels will be a tensor of shape (batch_size)

        # If pin_memory=True and you have a GPU, data is already in pinned memory.
        # You would typically move data to the GPU here:
        # if torch.cuda.is_available():
        #     batch_data = batch_data.to('cuda')
        #     batch_labels = batch_labels.to('cuda')

        if batch_idx < 3: # Print first few batches
            print(f"  Batch {batch_idx+1}:")
            print(f"    Data shape: {batch_data.shape}")
            print(f"    Labels shape: {batch_labels.shape}")
            # print(f"    Sample data from batch: {batch_data[0]}") # First item in batch
            # print(f"    Sample label from batch: {batch_labels[0]}")

        # Here, you would typically pass batch_data to your model, calculate loss, etc.
        # model(batch_data)
        # loss = criterion(output, batch_labels)
        # ...

    # Read the clock once, right after the batch loop finishes.
    elapsed = time.perf_counter() - start_time

    # You can try timing the loop with different num_workers to see the impact.
    # For this tiny synthetic dataset, the difference might be negligible or even
    # negative due to overhead. For datasets involving disk I/O, num_workers is key.
    if num_workers_val > 0:
        print(f"Time taken for epoch with {num_workers_val} workers: {elapsed:.4f} seconds")
    else:
        print(f"Time taken for epoch with main process loading: {elapsed:.4f} seconds")




--- Iterating through DataLoader (1 epoch) ---

Epoch 1
  Batch 1:
    Data shape: torch.Size([16, 5])
    Labels shape: torch.Size([16])
  Batch 2:
    Data shape: torch.Size([16, 5])
    Labels shape: torch.Size([16])
  Batch 3:
    Data shape: torch.Size([16, 5])
    Labels shape: torch.Size([16])
Time taken for epoch with main process loading: 0.0033 seconds


In [7]:
# Peek at the first batch only, then stop iterating.
for batch in data_loader:
    # Each batch is a two-element [data, labels] sequence.
    data, label = batch
    print(f"Data shape: {data.shape}, Label shape: {label.shape}")
    break

Data shape: torch.Size([16, 5]), Label shape: torch.Size([16])
