diff --git a/DataIngestion.py b/DataIngestion.py
new file mode 100644
index 0000000000000000000000000000000000000000..7898617fc58d9af637f304fed1f8abaf47af6b2d
--- /dev/null
+++ b/DataIngestion.py
@@ -0,0 +1,66 @@
+import pandas as pd
+import numpy as np
+import os
+from sklearn.model_selection import train_test_split
+
+class Load:
+    """Read a data file from disk into a pandas DataFrame.
+
+    The path given at construction time is the default source for every
+    loader method; each loader also accepts an explicit path that overrides
+    it for that one call.
+    """
+
+    def __init__(self, fileName):
+        """Remember ``fileName`` as the default path for the loader methods."""
+        self.fileName = fileName
+
+    @staticmethod
+    def _sourcePath(owner, filename):
+        """Return the path a loader call should read from.
+
+        ``filename`` wins when it is given. Otherwise the path stored on
+        ``owner`` is used. The loaders originally took the path as their only
+        argument, so a call made through the class (``Load.loadCVS(path)``)
+        arrives here with ``owner`` being the path itself; an owner without a
+        ``fileName`` attribute is therefore treated as the path.
+        """
+        if filename is not None:
+            return filename
+        return getattr(owner, "fileName", owner)
+
+    def loadCVS(self, filename=None):
+        """Read a CSV file and return its contents as a DataFrame.
+
+        Args:
+            filename: Optional path overriding the one stored on the
+                instance.
+
+        Returns:
+            The DataFrame produced by ``pandas.read_csv``.
+
+        Errors raised by pandas (for example for a missing file) propagate
+        unchanged.
+        """
+        # Previously the literal "filename.csv" was read regardless of input.
+        return pd.read_csv(Load._sourcePath(self, filename))
+
+    # Correctly spelled alias; ``loadCVS`` is kept for existing callers.
+    loadCSV = loadCVS
+
+    def loadJSON(self, filename=None):
+        """Read a JSON file and return its contents as a DataFrame.
+
+        Args:
+            filename: Optional path overriding the one stored on the
+                instance.
+
+        Returns:
+            The DataFrame produced by ``pandas.read_json``.
+
+        Errors raised by pandas propagate unchanged.
+        """
+        # Previously the literal 'data.json' was read regardless of input.
+        return pd.read_json(Load._sourcePath(self, filename))
+
+class Split:
+    """Partition a DataFrame into training and test sets."""
+
+    def __init__(self, fileName):
+        """Remember ``fileName``; the split helpers themselves do not read it."""
+        self.fileName = fileName
+
+    @staticmethod
+    def splitIntoTrainandTestData(df, test_size=0.05, random_state=0):
+        """Split ``df`` into feature/target train and test partitions.
+
+        Every column except the last is used as the feature matrix and the
+        last column as the target.
+
+        Args:
+            df: DataFrame to split.
+            test_size: Passed to ``train_test_split``; defaults to the
+                previously hard-coded 0.05.
+            random_state: Passed to ``train_test_split``; defaults to the
+                previously hard-coded 0.
+
+        Returns:
+            The tuple ``(X_train, X_test, y_train, y_test)``.
+        """
+        # Declared static so it works both as Split.splitIntoTrainandTestData(df)
+        # and on an instance; without this an instance call bound the instance
+        # itself to ``df``.
+        features = df.iloc[:, :-1]
+        target = df.iloc[:, -1]
+        X_train, X_test, y_train, y_test = train_test_split(
+            features, target, test_size=test_size, random_state=random_state
+        )
+        return X_train, X_test, y_train, y_test
+
+    @staticmethod
+    def allowUserToSelectSetsOfData(df):
+        """Return ``df`` unchanged.
+
+        NOTE(review): this looks like a placeholder for a future selection
+        step; no selection is performed yet.
+        """
+        return df
+
+class Validation:
+    """Sanity checks for an input file name and the data loaded from it.
+
+    All checks are static so they can be called on the class or an instance.
+    """
+
+    def __init__(self, fileName):
+        """Remember ``fileName``; the checks take their input as arguments."""
+        self.fileName = fileName
+
+    @staticmethod
+    def checkFileType(fileName):
+        """Return True when ``fileName`` ends in ``.csv`` or ``.json``.
+
+        The comparison ignores case. The earlier check looked for ``.cvs``,
+        which rejected every real CSV file.
+        """
+        extension = os.path.splitext(fileName)[1].lower()
+        return extension in (".csv", ".json")
+
+    @staticmethod
+    def checkNullValues(data):
+        """Return True when ``data`` has no missing or blank cells.
+
+        Args:
+            data: A DataFrame, or anything ``pandas.DataFrame`` accepts such
+                as a list of rows.
+
+        Returns:
+            False if any cell is NaN/None or a string that is empty or only
+            whitespace; True otherwise (including for empty input).
+        """
+        frame = pd.DataFrame(data)
+        if frame.isna().to_numpy().any():
+            return False
+        # Blank strings are not reported by isna(), so scan for them too.
+        return not any(
+            isinstance(cell, str) and not cell.strip()
+            for cell in frame.to_numpy(dtype=object).ravel()
+        )
+
+    @staticmethod
+    def checkFeatureNames(df):
+        """Print each column name of ``df`` and return the names as a list."""
+        names = list(df.columns)
+        for col_name in names:
+            print(col_name)
+        return names
+
+    @staticmethod
+    def checkInfValues(df):
+        """Return True when no numeric cell of ``df`` is +/- infinity.
+
+        Only numeric columns are inspected; applying ``np.isinf`` to text
+        columns raises ``TypeError``.
+        """
+        numeric = pd.DataFrame(df).select_dtypes(include="number")
+        values = numeric.to_numpy(dtype=float, na_value=np.nan)
+        return not np.isinf(values).any()
+
+
+class DataIngestion:
+    """Bundle the load, split and validation collaborators in one object."""
+
+    def __init__(self, load, split, validation):
+        """Store the three collaborators on the instance unchanged."""
+        self.load, self.split, self.validation = load, split, validation
+
+
+