-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_prepare.py
More file actions
95 lines (81 loc) · 3.9 KB
/
data_prepare.py
File metadata and controls
95 lines (81 loc) · 3.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/python
# coding=utf-8
import pandas as pd
from sklearn.cross_validation import train_test_split
# CSV file loaded as the training data by data_prepare().
TRAIN_PATH = '/home/yangqiao/pythonProject/PythonMLFramework/wine.csv'
# CSV file loaded as the test data; only read when data_prepare() is called
# with train_test_split_percentage == 0. It currently points at the same file
# as TRAIN_PATH.
TEST_PATH = '/home/yangqiao/pythonProject/PythonMLFramework/wine.csv'
# Positional indices of the columns to delete; leave the list empty to keep
# every column.
DELETE_INDEX = []
# Positional index of the label column (-1 selects the last column).
LABLE_INDEX = 0
# Remove unwanted columns and move the label column to the last position.
def data_clean(data_file, label_index=None, delete_index=None):
    """Load a CSV file and return it with the label moved to the last column.

    Columns are renumbered 0..n-1 right after loading, so every index used
    here is positional and the header names from the file are discarded.

    Args:
        data_file: Path or file-like object accepted by ``pandas.read_csv``.
        label_index: Position of the label column; negative values count
            from the end. When omitted, the module-level ``LABLE_INDEX`` is
            used.
        delete_index: Positions (as numbered in the file) of feature columns
            to drop. When omitted, the module-level ``DELETE_INDEX`` is used;
            if that is ``None`` as well, nothing is dropped.

    Returns:
        A DataFrame with the remaining feature columns followed by the label
        column.
    """
    if label_index is None:
        label_index = LABLE_INDEX
    if delete_index is None:
        delete_index = DELETE_INDEX

    frame = pd.read_csv(data_file)
    n_columns = len(frame.columns)
    frame.columns = range(n_columns)

    # drop() matches column *labels* while iloc works on positions. Turning a
    # negative position into its non-negative equivalent makes both address
    # the same column (previously only -1 was handled; e.g. -2 raised
    # KeyError in drop()).
    if label_index < 0:
        label_index += n_columns

    Y = frame.iloc[:, label_index]
    X = frame.drop(label_index, axis=1)
    if delete_index is not None:
        # delete some columns
        X = X.drop(delete_index, axis=1)
    return pd.concat([X, Y], axis=1)
# If the data is unbalanced, down-sample the majority class (label 0).
# sub_sample_time is the desired ratio of majority-class rows to minority-class rows,
# e.g. sub_sample_time=10 keeps about 10 majority-class rows for every minority-class row.
def sub_sample(data_file, sub_sample_time):
    """Down-sample the label-0 rows of a label-last DataFrame.

    Rows whose last column equals 1 are all kept. Rows whose last column
    equals 0 are randomly sampled without replacement so that roughly
    ``sub_sample_time`` of them remain for every label-1 row. Rows with any
    other label value are discarded.

    Args:
        data_file: DataFrame whose last column is the 0/1 label.
        sub_sample_time: Desired number of label-0 rows per label-1 row.

    Returns:
        ``(x_train, y_train)``: the feature columns and the label column of
        the sampled data, label-0 rows first.
    """
    labels = data_file.iloc[:, -1]
    minority = data_file[labels == 1]
    majority = data_file[labels == 0]
    n_minority = minority.shape[0]
    n_majority = majority.shape[0]

    if n_majority == 0:
        # Nothing to down-sample; also avoids dividing by zero below.
        train = minority
    else:
        # Sampling without replacement cannot return more rows than exist,
        # so cap the fraction at 1.0 (keep every label-0 row) instead of
        # letting DataFrame.sample raise ValueError.
        fraction = min(1.0, float(n_minority * sub_sample_time) / float(n_majority))
        majority_sample = majority.sample(frac=fraction, replace=False)
        train = pd.concat([majority_sample, minority], axis=0)

    x_train = train.iloc[:, :-1]
    y_train = train.iloc[:, -1]
    return x_train, y_train
# train test split giving a percent
def get_train_test_split(data_file, split_percentage):
    """Split a label-last DataFrame into train and test parts.

    Every column except the last is treated as a feature and the last column
    as the label; both are handed to ``train_test_split`` with
    ``test_size=split_percentage``.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    features = data_file.iloc[:, :-1]
    labels = data_file.iloc[:, -1]
    parts = train_test_split(features, labels, test_size=split_percentage)
    x_train, x_test, y_train, y_test = parts
    return x_train, x_test, y_train, y_test
# Prepare the data.
# save_file_path: if '' or None, nothing is saved; otherwise the train and test files are written into this folder
# log_path: the folder in which log.txt is written
# sub_sample_time: desired ratio of majority-class rows to minority-class rows; 0 disables sub-sampling
# train_test_split_percentage: fraction of the data held out as the test set; 0 disables splitting (the test set is then loaded from TEST_PATH)
def data_prepare(save_file_path, log_path, sub_sample_time, train_test_split_percentage):
    """Load, optionally sub-sample and split the data, then log its shape.

    Args:
        save_file_path: Prefix for the output files. When truthy,
            ``train.csv`` and ``test.csv`` are written to
            ``save_file_path + name``; when ``''`` or ``None`` nothing is
            saved.
        log_path: Prefix for the log file; the shapes of the train and test
            features are appended to ``log_path + 'log.txt'``. The prefix is
            concatenated as-is, so it must already end with a path separator.
        sub_sample_time: Ratio passed to ``sub_sample``; 0 disables
            sub-sampling.
        train_test_split_percentage: Test fraction passed to
            ``get_train_test_split``; 0 disables splitting, in which case the
            test set is loaded from ``TEST_PATH`` instead.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    data = data_clean(TRAIN_PATH)

    # Sub-sample only when requested (for unbalanced data).
    if sub_sample_time != 0:
        # sub_sample returns (features, labels). Join them back into one
        # label-last frame, because everything below indexes ``data`` with
        # .iloc and would fail on a tuple.
        x_sampled, y_sampled = sub_sample(data, sub_sample_time)
        data = pd.concat([x_sampled, y_sampled], axis=1)

    if train_test_split_percentage != 0:
        x_train, x_test, y_train, y_test = get_train_test_split(data, train_test_split_percentage)
    else:
        test = data_clean(TEST_PATH)
        x_test = test.iloc[:, :-1]
        y_test = test.iloc[:, -1]
        x_train = data.iloc[:, :-1]
        y_train = data.iloc[:, -1]

    # Write the CSV files once (the original repeated this block after
    # logging, rewriting identical files).
    if save_file_path:
        csv_train = pd.concat([x_train, y_train], axis=1)
        csv_test = pd.concat([x_test, y_test], axis=1)
        csv_train.to_csv(save_file_path + 'train.csv', index=False)
        csv_test.to_csv(save_file_path + 'test.csv', index=False)

    # The context manager closes the log file even if a write fails.
    with open(log_path + 'log.txt', 'a') as fp:
        fp.write('trainData shape is ' + str(x_train.shape) + '\n')
        fp.write('testData shape is ' + str(x_test.shape) + '\n')

    return x_train, x_test, y_train, y_test
# Prefix under which data_prepare() appends to log.txt.
# NOTE: this name shadows the builtin dir(); it is kept unchanged so that any
# code importing it from this module keeps working.
dir = '/home/yangqiao/pythonProject/PythonMLFramework/log/'
if __name__ == "__main__":
    # No saving, no sub-sampling, no split: the train and test sets are read
    # from TRAIN_PATH and TEST_PATH respectively.
    x_train, x_test, y_train, y_test = data_prepare(None, dir, 0, 0)
    # Parenthesised single-argument print works on both Python 2 and 3; the
    # bare print statement is a SyntaxError on Python 3 and prevented the
    # module from even being imported there.
    print(x_train.shape)
    print(x_test.shape)