"No one is harder on a talented person than the person themselves" - Linda Wilkinson ; "Trust your guts and don't follow the herd" ; "Validate direction not destination" ;

December 11, 2019

Day #301 - Data Batching in Keras

This post is about custom data batching in Keras by overriding the methods of the built-in keras.utils.Sequence class. The example below generates dummy data, splits it into train and test sets, and fetches batches of records.

#Generate dummy data
import pandas as pd
import numpy as np
#Generate 250 records split into 6 columns
df = pd.DataFrame(np.random.randint(0, 100, size=(250, 6)), columns=['X1', 'X2', 'X3', 'X4', 'Y1', 'Y2'])
print(df.head())
print(df.count())
#Split into X and Y
X = df[['X1', 'X2', 'X3', 'X4']]
Y = df[['Y1', 'Y2']]
print(X.head())
print(Y.head())
#Split data into train and test (sklearn.cross_validation was removed; use model_selection)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
import keras
import math

class Generator(keras.utils.Sequence):
    #Dataset wrapper around the DataFrames for batched training
    def __init__(self, x_set, y_set, batch_size, datacount):
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])
        self.datacount = datacount

    def __len__(self):
        #Number of batches per epoch (ceil keeps the partial last batch)
        print('length')
        print(math.ceil(self.datacount / self.batch_size))
        return math.ceil(self.datacount / self.batch_size)

    def __getitem__(self, idx):
        print('idx')
        print(idx)
        i1 = idx * self.batch_size
        i2 = min((idx + 1) * self.batch_size, self.datacount)
        print('Start-' + str(i1) + '- End-' + str(i2))
        #Slice through the index array so the shuffle in on_epoch_end takes effect
        inds = self.indices[i1:i2]
        batch_x = self.x.iloc[inds].values
        batch_y = self.y.iloc[inds].values
        return batch_x, batch_y

    def on_epoch_end(self):
        #Reshuffle the row order between epochs
        np.random.shuffle(self.indices)
batch_size = 10
print('x_train')
print(len(x_train))
print('x_test')
print(len(x_test))
training_generator = Generator(x_train, y_train, batch_size, len(x_train))
validation_generator = Generator(x_test, y_test, batch_size, len(x_test))
print('training_generator')
#Fetch every batch of training data (len() calls __len__, indexing calls __getitem__)
for batch_id in range(len(training_generator)):
    print('batch_id')
    print(batch_id)
    print(training_generator[batch_id])
print('validation_generator')
#Fetch every batch of test data
for batch_id in range(len(validation_generator)):
    print('batch_id')
    print(batch_id)
    print(validation_generator[batch_id])
#Model Architecture, Layers, Compile
#Model fit_generator
#Model Save Checkpoint
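
As a minimal sketch of those three placeholder steps (the layer sizes, optimizer, and model.h5 file name below are my assumptions, not from the original run): a small dense network with 4 inputs and 2 outputs, trained via fit_generator with a ModelCheckpoint callback.

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

#Minimal illustrative model: 4 input features (X1-X4), 2 targets (Y1, Y2)
model = Sequential()
model.add(Dense(16, activation='relu', input_dim=4))
model.add(Dense(2))
model.compile(optimizer='adam', loss='mse')

#Save the best weights seen so far after each epoch
checkpoint = ModelCheckpoint('model.h5', save_best_only=True)
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    epochs=5,
                    callbacks=[checkpoint])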


Other strategies
  • Database -> export 50K-record chunks to CSV -> train on each chunk and save a checkpoint
  • Reload the checkpoint on each run and continue with the next 50K chunk
This is a classic data-fetching pattern. A database can store millions of records, so we can export each chunk to a CSV, train on it, save a checkpoint, and continue from that checkpoint on the next run, as sketched below.
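
A rough sketch of that workflow, assuming the records have already been exported to a CSV (the exported_records.csv file name and chunk size are placeholders) and reusing the model compiled above:

import os
import pandas as pd
from keras.models import load_model

#Stream the exported records in 50K-row chunks instead of loading everything
for chunk in pd.read_csv('exported_records.csv', chunksize=50000):
    #Resume from the last checkpoint when one exists,
    #otherwise continue with the in-memory model from above
    if os.path.exists('model.h5'):
        model = load_model('model.h5')
    X = chunk[['X1', 'X2', 'X3', 'X4']].values
    Y = chunk[['Y1', 'Y2']].values
    model.fit(X, Y, batch_size=10, epochs=1)
    #Checkpoint after every chunk so the next run picks up from here
    model.save('model.h5')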

Happy Learning!!!
