-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpulldata.py
More file actions
77 lines (72 loc) · 3 KB
/
pulldata.py
File metadata and controls
77 lines (72 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# pulldata.py (originally off_india_pull.py): export Open Food Facts products listed for India to a CSV file.
import csv, time, requests, sys
from tqdm import tqdm
# Country-scoped Open Food Facts listing; main() appends ".json" or
# "/<page>.json" to this prefix to build each request URL.
BASE = "https://world.openfoodfacts.org/country/india"

# Sent with every request so the client identifies itself to the server.
HEADERS = {"User-Agent": "NutriVisionAI/1.0 (india-pipeline)"}

# CSV column order. Every key produced by row_from_product() appears here.
FIELDS = [
    # Identity and packaging
    "code",
    "product_name",
    "brands",
    "categories",
    "countries_tags",
    "quantity",
    "serving_size",
    # Nutrition per 100 g
    "energy_kcal_100g",
    "proteins_100g",
    "fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "salt_100g",
    "sodium_100g",
    # Labels, free text and images
    "nutriscore_grade",
    "allergens",
    "ingredients_text",
    "image_url",
    "image_small_url",
    # Bookkeeping
    "last_modified_t",
]
# Open Food Facts stores the generic "energy_100g" nutriment in kilojoules,
# so it must be converted before it can be written to a kcal column.
_KJ_PER_KCAL = 4.184


def _energy_kcal_100g(nutriments):
    """Return the energy per 100 g in kcal, or None when it is unknown.

    Prefers the explicit ``energy-kcal_100g`` value. When that is missing,
    falls back to ``energy_100g`` (kJ) converted to kcal and rounded to two
    decimals.
    """
    kcal = nutriments.get("energy-kcal_100g")
    if kcal is not None and kcal != "":
        # A genuine 0 kcal reading (e.g. water) is valid data; a plain
        # truthiness test would wrongly treat it as missing.
        return kcal

    kilojoules = nutriments.get("energy_100g")
    if kilojoules is None or kilojoules == "":
        return None
    try:
        return round(float(kilojoules) / _KJ_PER_KCAL, 2)
    except (TypeError, ValueError):
        # An unparseable kJ value cannot be expressed in kcal; leave blank.
        return None


def row_from_product(p):
    """Flatten one Open Food Facts product dict into a CSV row dict.

    Args:
        p: Product mapping as returned in the ``products`` list of the API
            response. Missing keys are tolerated.

    Returns:
        A dict whose keys are the CSV column names. Text columns default to
        an empty string, numeric nutriment columns to ``None`` when absent,
        and ``countries_tags`` is joined into a comma-separated string.
    """
    # "nutriments" may be absent or explicitly null in the payload.
    n = p.get("nutriments", {}) or {}
    return {
        "code": p.get("code"),
        "product_name": p.get("product_name", ""),
        "brands": p.get("brands", ""),
        "categories": p.get("categories", ""),
        "countries_tags": ",".join(p.get("countries_tags", []) or []),
        "quantity": p.get("quantity", ""),
        "serving_size": p.get("serving_size", ""),
        "energy_kcal_100g": _energy_kcal_100g(n),
        "proteins_100g": n.get("proteins_100g"),
        "fat_100g": n.get("fat_100g"),
        "carbohydrates_100g": n.get("carbohydrates_100g"),
        "sugars_100g": n.get("sugars_100g"),
        "fiber_100g": n.get("fiber_100g"),
        "salt_100g": n.get("salt_100g"),
        "sodium_100g": n.get("sodium_100g"),
        "nutriscore_grade": p.get("nutriscore_grade", ""),
        "allergens": p.get("allergens", ""),
        "ingredients_text": p.get("ingredients_text", ""),
        "image_url": p.get("image_url", ""),
        "image_small_url": p.get("image_small_url", ""),
        "last_modified_t": p.get("last_modified_t"),
    }
# HTTP statuses worth retrying: rate limiting and transient server errors.
_RETRY_STATUSES = frozenset({429, 500, 502, 503, 504})


def _fetch_products(session, url, attempts=3, backoff=2.0):
    """Fetch one listing page and return its ``products`` list.

    Retries connection errors, timeouts and the statuses in
    ``_RETRY_STATUSES`` up to ``attempts`` times, sleeping a little longer
    before each retry. The original ``requests`` exception is re-raised once
    the attempts are exhausted or the error is not transient.
    """
    attempts = max(1, attempts)
    for attempt in range(1, attempts + 1):
        try:
            response = session.get(url, timeout=30)
            response.raise_for_status()
            return response.json().get("products", []) or []
        except (requests.ConnectionError, requests.Timeout):
            if attempt == attempts:
                raise
        except requests.HTTPError as err:
            status = err.response.status_code if err.response is not None else None
            if attempt == attempts or status not in _RETRY_STATUSES:
                raise
        # Linear backoff so a rate-limited server gets time to recover.
        time.sleep(backoff * attempt)


def main(out_csv="off_india_products.csv", max_pages=None, pause=0.25):
    """Download the India product listing page by page into a CSV file.

    Args:
        out_csv: Path of the CSV file to create (overwritten if it exists).
        max_pages: Stop after this many pages; a falsy value means no limit.
        pause: Seconds to sleep between consecutive page requests.

    Raises:
        requests.RequestException: When a page cannot be fetched even after
            retrying. Rows written before the failure remain in the file.
    """
    seen = set()  # product codes already written, to skip duplicates
    page = 1
    total_written = 0
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=FIELDS, quoting=csv.QUOTE_MINIMAL)
        w.writeheader()
        # Both the session and the progress bar are context-managed so they
        # are released even when a request raises midway through the crawl.
        with requests.Session() as s, tqdm(desc="Pages", unit="page") as pbar:
            s.headers.update(HEADERS)
            while True:
                # The first page has no page number in its URL.
                url = f"{BASE}/{page}.json" if page > 1 else f"{BASE}.json"
                products = _fetch_products(s, url)
                if not products:
                    # An empty page marks the end of the listing.
                    break
                for p in products:
                    code = p.get("code")
                    if not code or code in seen:
                        continue
                    seen.add(code)
                    w.writerow(row_from_product(p))
                    total_written += 1
                pbar.update(1)
                page += 1
                if max_pages and page > max_pages:
                    break
                time.sleep(pause)  # be polite
    print(f"Done. Wrote {total_written} rows to {out_csv}")
if __name__ == "__main__":
    # The optional first command-line argument caps the number of pages
    # fetched; without it the crawl runs until the listing is exhausted.
    cli_args = sys.argv[1:]
    max_pages = int(cli_args[0]) if cli_args else None
    main(max_pages=max_pages)