# -*- coding: utf-8 -*-
"""Data_preprocessing.py

Character-level preprocessing for an English-to-Hindi seq2seq dataset:
builds character lookup tables, converts words into lists of character
ids, and pickles the result for later use.
"""
import copy
import pickle

# special tokens: padding, end-of-sequence, unknown character, decoder start
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3}


def create_lookup_table(vocab):
    # make a sorted list of the unique characters so that the id
    # assignment is deterministic across runs
    vocab = sorted(set(vocab))
    vocab_to_int = copy.copy(CODES)
    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i
    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab
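
# A minimal usage sketch (hypothetical two-character vocabulary; the ids
# follow the deterministic sorted ordering above):
#
#   >>> vocab_to_int, int_to_vocab = create_lookup_table('ba')
#   >>> vocab_to_int
#   {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3, 'a': 4, 'b': 5}
#   >>> int_to_vocab[4]
#   'a'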


def text_to_ids(source_words, target_words, source_vocab_to_int, target_vocab_to_int):
    # 1st, 2nd args: lists of raw words to be converted
    # 3rd, 4th args: lookup tables for the 1st and 2nd args respectively
    # return: a tuple of lists (source_id_text, target_id_text)
    # empty lists for the converted words
    source_text_id = []
    target_text_id = []
    # iterate through each word (source and target hold the same number of words)
    for i in range(len(source_words)):
        # extract words one by one
        source_word = source_words[i]
        target_word = target_words[i]
        # split the chosen word into a list of characters
        source_tokens = list(source_word)
        target_tokens = list(target_word)
        # empty lists for the character ids of the chosen word
        source_token_id = []
        target_token_id = []
        # map each character to its id, falling back to <UNK> for
        # characters missing from the vocabulary
        for token in source_tokens:
            source_token_id.append(source_vocab_to_int.get(token, CODES['<UNK>']))
        for token in target_tokens:
            target_token_id.append(target_vocab_to_int.get(token, CODES['<UNK>']))
        # put the <EOS> token at the end of the chosen target word;
        # it tells the decoder when to stop generating a sequence
        target_token_id.append(target_vocab_to_int['<EOS>'])
        # add each converted word to the final lists
        source_text_id.append(source_token_id)
        target_text_id.append(target_token_id)
    return source_text_id, target_text_id
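
# A short conversion sketch, reusing the hypothetical tables from the
# example above; note the trailing <EOS> (id 1) on the target side:
#
#   >>> text_to_ids(['ab'], ['ba'], vocab_to_int, vocab_to_int)
#   ([[4, 5]], [[5, 4, 1]])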


def preprocess(source_text, target_text):
    Englishvocab = 'abcdefghijklmnopqrstuvwxyz'
    Hindivocab = 'ँंॉॆॊॏऺऻॎःािीुूेैोौअआइईउऊएऐओऔकखगघचछजझटठडढणतथदधनपफबभमयरलवशषसहज्ञक्षश्रज़रफ़ड़ढ़ख़क़ग़ळृृ़़ऑ'
    # create lookup tables for the English and Hindi character sets
    source_vocab_to_int, source_int_to_vocab = create_lookup_table(Englishvocab)
    target_vocab_to_int, target_int_to_vocab = create_lookup_table(Hindivocab)
    # convert each word into a list of character ids
    source_text, target_text = text_to_ids(source_text, target_text,
                                           source_vocab_to_int, target_vocab_to_int)
    # save the data for later use; the with block ensures the file
    # handle is closed after writing
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((
            (source_text, target_text),
            (source_vocab_to_int, target_vocab_to_int),
            (source_int_to_vocab, target_int_to_vocab)), out_file)
    return (source_text, target_text,
            source_vocab_to_int, target_vocab_to_int,
            source_int_to_vocab, target_int_to_vocab)


# function to load the preprocessed data
def load_preprocess():
    with open('preprocess.p', mode='rb') as in_file:
        return pickle.load(in_file)
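

# A minimal round-trip sketch, assuming parallel lists of lowercase English
# words and their Hindi counterparts (the two sample pairs below are
# illustrative, not taken from the project's dataset):
if __name__ == '__main__':
    english_words = ['namaste', 'bharat']
    hindi_words = ['नमस्ते', 'भारत']
    preprocess(english_words, hindi_words)
    # reload exactly what preprocess() pickled and inspect one word pair
    (source_ids, target_ids), _, _ = load_preprocess()
    print(source_ids[0])  # character ids for 'namaste'
    print(target_ids[0])  # character ids for 'नमस्ते', ending with <EOS> (id 1)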