# Generate dummy data
import pandas as pd
import numpy as np

# Generate 250 records split into 6 columns
df = pd.DataFrame(np.random.randint(0, 100, size=(250, 6)),
                  columns=['X1', 'X2', 'X3', 'X4', 'Y1', 'Y2'])
print(df.head())
print(df.count())

# Split into X and Y
X = df[['X1', 'X2', 'X3', 'X4']]
Y = df[['Y1', 'Y2']]
print(X.head())
print(Y.head())

# Split data into train and test
# (sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
import keras
import math

class Generator(keras.utils.Sequence):
    # Dataset wrapper that feeds the model one batch at a time
    def __init__(self, x_set, y_set, batch_size, datacount):
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])
        self.datacount = datacount

    def __len__(self):
        # Number of batches per epoch (ceil keeps the final partial batch)
        return math.ceil(self.datacount / self.batch_size)

    def __getitem__(self, idx):
        i1 = idx * self.batch_size
        i2 = min((idx + 1) * self.batch_size, self.datacount)
        print('Batch ' + str(idx) + ': rows ' + str(i1) + ' to ' + str(i2))
        # Select rows through the (possibly shuffled) index order and
        # return numpy arrays, which Keras expects
        batch_idx = self.indices[i1:i2]
        batch_x = self.x.iloc[batch_idx].values
        batch_y = self.y.iloc[batch_idx].values
        return batch_x, batch_y

    def on_epoch_end(self):
        # Reshuffle so each epoch draws batches in a different order
        np.random.shuffle(self.indices)
batch_size = 10
print('x_train rows: ' + str(len(x_train)))
print('x_test rows: ' + str(len(x_test)))

training_generator = Generator(x_train, y_train, batch_size, len(x_train))
validation_generator = Generator(x_test, y_test, batch_size, len(x_test))

# Walk every training batch to check the generator output;
# len(generator) uses ceil, so the final partial batch is included
print('training_generator')
for batch_id in range(len(training_generator)):
    print('batch_id ' + str(batch_id))
    print(training_generator[batch_id])

# Walk every validation batch the same way
print('validation_generator')
for batch_id in range(len(validation_generator)):
    print('batch_id ' + str(batch_id))
    print(validation_generator[batch_id])
# Model architecture, layers, compile
# Model fit with the generators
# Model save checkpoint
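The last three steps are only outlined above. A minimal sketch of how they could look is below; the layer sizes, optimizer, loss, epoch count, and checkpoint filename are illustrative assumptions, not part of the original script.

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

# Small fully connected network: 4 inputs (X1-X4) to 2 outputs (Y1, Y2)
model = Sequential([
    Dense(16, activation='relu', input_shape=(4,)),
    Dense(8, activation='relu'),
    Dense(2)  # linear outputs for the two targets
])
model.compile(optimizer='adam', loss='mse')

# Save the best weights after each epoch (path is an illustrative choice)
checkpoint = ModelCheckpoint('model_checkpoint.h5', save_best_only=True)

# fit_generator matches the older Keras API used here;
# current Keras accepts Sequence objects directly in model.fit
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    epochs=10,
                    callbacks=[checkpoint])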
Other strategies
- Database -> CSV export -> train on 50K-record chunks, saving a checkpoint after each chunk
- Save a checkpoint after each run and reload it before training on the next 50K chunk (see the sketch below)
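A minimal sketch of that chunked approach, assuming the model defined above, a hypothetical data.csv export, and an illustrative checkpoint path. pandas' chunksize option streams the file 50K rows at a time instead of loading it all into memory.

import os
import pandas as pd

CHECKPOINT = 'chunk_checkpoint.h5'  # illustrative path, not from the original post

# Resume from the previous run's weights if a checkpoint exists
if os.path.exists(CHECKPOINT):
    model.load_weights(CHECKPOINT)

# Stream the CSV export 50,000 rows at a time
for chunk in pd.read_csv('data.csv', chunksize=50000):
    x_chunk = chunk[['X1', 'X2', 'X3', 'X4']].values
    y_chunk = chunk[['Y1', 'Y2']].values
    model.fit(x_chunk, y_chunk, batch_size=10, epochs=1)
    # Persist progress so the next run can pick up from here
    model.save_weights(CHECKPOINT)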