-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcookingRecipe.py
More file actions
executable file
·111 lines (86 loc) · 3.79 KB
/
Copy pathcookingRecipe.py
File metadata and controls
executable file
·111 lines (86 loc) · 3.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#Yixuan Qian
from __future__ import print_function
import json
import random
import time
import math
# Wall-clock reference taken at start-up; the final print at the bottom of the
# script subtracts it from the current time to report the total run time.
start_time = time.time()
def load_data(filename):
    """Load a JSON-lines file into a list of decoded records.

    Each non-blank line of the file is parsed with ``json.loads`` and the
    resulting objects are returned in file order.

    Args:
        filename: Path of the file to read; one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a non-blank line is not valid JSON.
    """
    data = []
    # The context manager guarantees the handle is closed, even when a line
    # fails to parse part-way through the file.
    with open(filename) as handle:
        for line in handle:
            # A blank line (typically a trailing newline at end of file) is
            # not a JSON document; skip it instead of letting json.loads fail.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data
def get_cuisine_probs(data):
    """Return the share of recipes in ``data`` that belong to each cuisine.

    Args:
        data: Sequence of recipe dicts, each with a 'cuisine' key.

    Returns:
        A dict mapping every cuisine seen in ``data`` to its count divided
        by the number of recipes. Empty input yields an empty dict.
    """
    # Tally how many recipes carry each cuisine label
    cuisine_count = {}
    for recipe in data:
        label = recipe['cuisine']
        if label in cuisine_count:
            cuisine_count[label] += 1
        else:
            cuisine_count[label] = 1
    # Turn each tally into a fraction of the whole dataset
    cuisine_probs = {}
    for label, count in cuisine_count.items():
        cuisine_probs[label] = count / float(len(data))
    return cuisine_probs
def get_ingredient_prob_given_cuisine(data, all_cuisines, all_ingredients, prior=0.15):
    """Estimate p(ingredient | cuisine) with additive smoothing.

    Every (cuisine, ingredient) pair starts at the ``prior`` pseudo-count, is
    incremented once per occurrence of the ingredient in a recipe of that
    cuisine, and each cuisine's counts are then normalised to sum to one.

    Args:
        data: Sequence of recipe dicts with 'cuisine' and 'ingredients' keys.
        all_cuisines: Re-iterable collection of the cuisines to model.
        all_ingredients: Re-iterable collection of the ingredients to model
            (it is iterated once per cuisine).
        prior: Pseudo-count given to every ingredient before counting.
            Defaults to 0.15, the value that was previously hard-coded.

    Returns:
        A dict mapping each cuisine to a dict of ingredient -> probability.

    Raises:
        KeyError: If a recipe names a cuisine or ingredient that is missing
            from ``all_cuisines`` / ``all_ingredients``.
    """
    # Seed the table with the prior so that ingredients never observed with a
    # cuisine still receive a non-zero probability.
    counts = {
        cuisine: {ingredient: prior for ingredient in all_ingredients}
        for cuisine in all_cuisines
    }
    # Count how often each ingredient occurs within each cuisine
    for recipe in data:
        cuisine_counts = counts[recipe['cuisine']]
        for ingredient in recipe['ingredients']:
            cuisine_counts[ingredient] += 1
    # Normalise each cuisine's counts (observations plus prior) to sum to one
    ingredient_prob_given_cuisine = {}
    for cuisine, cuisine_counts in counts.items():
        total = float(sum(cuisine_counts.values()))
        if total == 0:
            # Only reachable with a zero prior and no observations for this
            # cuisine: there is nothing to normalise, so report zeros rather
            # than dividing by zero.
            ingredient_prob_given_cuisine[cuisine] = {
                ingredient: 0.0 for ingredient in cuisine_counts
            }
        else:
            ingredient_prob_given_cuisine[cuisine] = {
                ingredient: count / total
                for ingredient, count in cuisine_counts.items()
            }
    return ingredient_prob_given_cuisine
def get_max_cuisine(ingredient_list, cuisine_probs, ingredient_prob_given_cuisine):
    """Return the cuisine with the highest naive Bayes score for a recipe.

    The score of a cuisine is p(cuisine) times the product of
    p(ingredient | cuisine) over ``ingredient_list``. It is accumulated as a
    sum of logarithms: multiplying many small probabilities directly
    underflows to 0.0 for long ingredient lists, which made every cuisine
    score zero and the function return None.

    Args:
        ingredient_list: Ingredients of the recipe to classify.
        cuisine_probs: Dict mapping cuisine -> p(cuisine).
        ingredient_prob_given_cuisine: Dict mapping cuisine -> dict of
            ingredient -> p(ingredient | cuisine).

    Returns:
        The best-scoring cuisine, or None when ``cuisine_probs`` is empty or
        every cuisine has a non-positive probability for this recipe. On an
        exact tie the cuisine encountered first is kept.
    """
    best_cuisine = None
    best_score = float('-inf')
    # Iterate over every cuisine
    for cuisine, cuisine_prob in cuisine_probs.items():
        # A non-positive probability can never win and has no logarithm
        if cuisine_prob <= 0:
            continue
        # Start from log p(cuisine)
        score = math.log(cuisine_prob)
        # Add log p(ingredient | cuisine) for each ingredient in the recipe
        for ingredient in ingredient_list:
            # Defaulting to 1 (log 1 == 0) instead of 0 makes the classifier
            # ignore ingredients that have never been seen before.
            likelihood = ingredient_prob_given_cuisine[cuisine].get(ingredient, 1)
            if likelihood <= 0:
                # A zero factor rules this cuisine out entirely
                score = float('-inf')
                break
            score += math.log(likelihood)
        # Strict comparison: keep the earlier cuisine on ties
        if score > best_score:
            best_score = score
            best_cuisine = cuisine
    return best_cuisine
def test_classifier(data, k, num_folds=6):
    """Train on all folds but the k-th and classify the k-th fold.

    ``data`` is cut into ``num_folds`` consecutive folds of
    ceil(len(data) / num_folds) records; fold ``k`` is held out as the test
    set and the remaining records train the classifier.

    Args:
        data: Sliceable sequence (e.g. a list) of recipe dicts with
            'cuisine' and 'ingredients' keys.
        k: Zero-based index of the fold to hold out; expected to lie in
            range(num_folds).
        num_folds: Positive number of folds to split ``data`` into. Defaults
            to 6, the value that was previously hard-coded.

    Returns:
        A list of (guessed_cuisine, true_cuisine) tuples, one per test recipe.
    """
    # Split data into training and test set
    fold_size = int(math.ceil(len(data) / float(num_folds)))
    start = k * fold_size
    stop = start + fold_size
    test_data = data[start:stop]
    # Take the complement by position. A membership test against the test
    # fold compares every record with every test record (quadratic) and also
    # discards training records that merely equal a test record.
    train_data = data[:start] + data[stop:]
    # Train the classifier
    cuisine_probs = get_cuisine_probs(train_data)
    # Get list of all cuisines and all ingredients seen during training
    all_cuisines = list(cuisine_probs)
    all_ingredients = {
        ingredient
        for recipe in train_data
        for ingredient in recipe['ingredients']
    }
    ingredient_prob_given_cuisine = get_ingredient_prob_given_cuisine(
        train_data, all_cuisines, all_ingredients)
    # Test the classifier: pair each prediction with the true label
    results = []
    for recipe in test_data:
        guessed = get_max_cuisine(
            recipe['ingredients'], cuisine_probs, ingredient_prob_given_cuisine)
        results.append((guessed, recipe['cuisine']))
    return results
def eval_classifier(results):
    """Return the fraction of predictions in ``results`` that are correct.

    Args:
        results: Sized collection of (guessed_cuisine, true_cuisine) pairs.

    Returns:
        The number of pairs whose two members compare equal, divided by the
        number of pairs, as a float.

    Raises:
        ZeroDivisionError: If ``results`` is empty.
    """
    correct = 0
    for guessed_cuisine, true_cuisine in results:
        # A bool adds as 0 or 1, so this counts the matching pairs
        correct += guessed_cuisine == true_cuisine
    return correct / float(len(results))
# Driver: 6-fold cross-validation of the classifier on 'training.json'.
# The recipes are shuffled once, each fold is held out in turn, and the
# per-fold error (1 - accuracy) plus the average over the folds are printed.
dataset = load_data('training.json')
random.shuffle(dataset)
sum_error = 0
for k in range(6):
    re = test_classifier(dataset, k)
    error = 1 - eval_classifier(re)
    sum_error = sum_error + error
    print(k+1, ' trial: ', error)
print()
print('Average generalization error: ', sum_error/6)
# start_time is set right after the imports at the top of the file
print("Total time: %s seconds " % (time.time() - start_time))