PythonMLFramework/data_prepare.py at master · yang1young/PythonMLFramework · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/python
# coding=utf-8
import pandas as pd
from sklearn.cross_validation import train_test_split

# CSV files loaded through data_clean() by data_prepare(). TEST_PATH is only
# read when data_prepare() is called without a train/test split. Both
# currently point at the same file.
TRAIN_PATH = '/home/yangqiao/pythonProject/PythonMLFramework/wine.csv'
TEST_PATH = '/home/yangqiao/pythonProject/PythonMLFramework/wine.csv'
# Positional indices (0-based) of the columns to drop from the features;
# leave the list empty to keep every column.
DELETE_INDEX = []
# Positional index (0-based) of the label column; -1 selects the last column.
LABLE_INDEX = 0


# remove unwanted columns and move the label column to the last position
def data_clean(data_file):
    """Load a CSV file and return a frame whose last column is the label.

    The columns are renumbered 0..n-1 right after loading, so ``LABLE_INDEX``
    and ``DELETE_INDEX`` are interpreted as positional column indices.

    NOTE(review): ``pd.read_csv`` is called with its defaults, so the first
    line of the file is consumed as a header row -- confirm the input files
    really have one.

    :param data_file: path (or file-like object) accepted by ``pd.read_csv``.
    :return: DataFrame with the kept feature columns followed by the label
        column.
    :raises IndexError: if ``LABLE_INDEX`` is outside the column range.
    """
    csv = pd.read_csv(data_file)
    column_count = len(csv.columns)
    csv.columns = range(column_count)

    # Column labels are 0..n-1, so a negative index (-1 == last column) has to
    # be turned into its positional equivalent before it can be dropped.
    label_index = LABLE_INDEX
    if label_index < 0:
        label_index += column_count

    Y = csv.iloc[:, label_index]
    X = csv.drop(label_index, axis=1)

    if DELETE_INDEX is not None:
        # delete the unwanted feature columns
        X = X.drop(DELETE_INDEX, axis=1)
    return pd.concat([X, Y], axis=1)


# if the data is unbalanced, keep only a sample of the majority class
# sub_sample_time is the wanted ratio of majority rows to minority rows,
# e.g. sub_sample_time=10 keeps about 10 majority rows per minority row
def sub_sample(data_file, sub_sample_time):
    """Down-sample the majority class of a labelled frame.

    The last column is the label: rows labelled 1 are the minority class and
    are all kept, rows labelled 0 are the majority class and are sampled
    without replacement. Rows carrying any other label are discarded.

    :param data_file: DataFrame whose last column holds the 0/1 label.
    :param sub_sample_time: wanted number of majority rows per minority row.
    :return: ``(x_train, y_train)`` -- the feature columns and the label
        column of the sampled frame.
    """
    labels = data_file.iloc[:, len(data_file.columns) - 1]
    train_black = data_file[labels == 1]
    train_white = data_file[labels == 0]

    white_count = train_white.shape[0]
    if white_count == 0:
        # Nothing to sample from; also avoids dividing by zero below.
        train = train_black
    else:
        percentage = float(train_black.shape[0] * sub_sample_time) / float(white_count)
        # Sampling without replacement cannot return more rows than exist, so
        # a request larger than the majority class keeps every majority row.
        percentage = min(percentage, 1.0)
        train_white_sample = train_white.sample(frac=percentage, replace=False)
        train = pd.concat([train_white_sample, train_black], axis=0)

    x_train = train.iloc[:, :-1]
    y_train = train.iloc[:, -1]
    return x_train, y_train


# random train/test split; split_percentage is handed to train_test_split as test_size
def get_train_test_split(data_file, split_percentage):
    """Split a labelled frame into train and test parts.

    :param data_file: DataFrame whose last column is the label.
    :param split_percentage: value passed as ``test_size``.
    :return: ``(x_train, x_test, y_train, y_test)``.
    """
    features = data_file.iloc[:, :-1]
    labels = data_file.iloc[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=split_percentage)
    return x_train, x_test, y_train, y_test


# prepare data
# save_file_path: if set to '' or None nothing is saved, otherwise train.csv and test.csv are written under it
# log_path: the path prefix of the log file (log.txt is appended to it)
# sub_sample_time: wanted ratio of majority to minority rows, 0 means no sub-sampling
# train_test_split_percentage: test share of a random split, 0 means TEST_PATH is used as the test set
def data_prepare(save_file_path, log_path, sub_sample_time, train_test_split_percentage):
    """Build the train and test sets and record their shapes in the log.

    :param save_file_path: prefix for ``train.csv`` / ``test.csv``; a falsy
        value disables saving.
    :param log_path: prefix of ``log.txt``, which is opened in append mode.
    :param sub_sample_time: majority/minority ratio handed to
        ``sub_sample``; ``0`` skips sub-sampling.
    :param train_test_split_percentage: test share handed to
        ``get_train_test_split``; ``0`` loads the test set from ``TEST_PATH``
        instead.
    :return: ``(x_train, x_test, y_train, y_test)``.
    """
    data = data_clean(TRAIN_PATH)

    # unbalanced data: down-sample the majority class
    if sub_sample_time != 0:
        # sub_sample returns (features, labels); glue them back into a single
        # frame with the label last, which is the layout the code below expects.
        x_sampled, y_sampled = sub_sample(data, sub_sample_time)
        data = pd.concat([x_sampled, y_sampled], axis=1)

    if train_test_split_percentage != 0:
        x_train, x_test, y_train, y_test = get_train_test_split(data, train_test_split_percentage)
    else:
        test = data_clean(TEST_PATH)
        x_test = test.iloc[:, :-1]
        y_test = test.iloc[:, -1]
        x_train = data.iloc[:, :-1]
        y_train = data.iloc[:, -1]

    if save_file_path:
        csv_train = pd.concat([x_train, y_train], axis=1)
        csv_test = pd.concat([x_test, y_test], axis=1)
        csv_train.to_csv(save_file_path + 'train.csv', index=False)
        csv_test.to_csv(save_file_path + 'test.csv', index=False)

    # The context manager closes the log even if a write fails.
    with open(log_path + 'log.txt', 'a') as fp:
        fp.write('trainData shape is ' + str(x_train.shape) + '\n')
        fp.write('testData shape is ' + str(x_test.shape) + '\n')

    return x_train, x_test, y_train, y_test


# Path prefix under which log.txt is written when the module runs as a script.
# NOTE: the name shadows the builtin ``dir``; it is kept unchanged so that any
# code importing it from this module keeps working.
dir = '/home/yangqiao/pythonProject/PythonMLFramework/log/'
if __name__ == "__main__":
    # No saving, no sub-sampling, no random split: the train and test sets are
    # loaded from TRAIN_PATH and TEST_PATH.
    x_train, x_test, y_train, y_test = data_prepare(None, dir, 0, 0)
    # A parenthesised single-argument print is valid on Python 2 and Python 3.
    print(x_train.shape)
    print(x_test.shape)