-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathexport_islands.py
More file actions
91 lines (72 loc) · 3.69 KB
/
export_islands.py
File metadata and controls
91 lines (72 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import time
# Seconds to wait for the remote server before giving up on a request.
REQUEST_TIMEOUT = 30


def parse_coordinate(text, positive, negative):
    """Convert a coordinate string such as "17.5º S" to signed decimal degrees.

    Args:
        text: Raw text containing the number, the "º" sign and a hemisphere
            letter.
        positive: Hemisphere letter that keeps the value positive ("N" or "E").
        negative: Hemisphere letter that negates the value ("S" or "W").

    Returns:
        The coordinate as a float, negative when *negative* occurs in *text*.

    Raises:
        ValueError: If what remains after stripping the markers is not a number.
    """
    number = (
        text.replace("º", "")
        .replace(positive, "")
        .replace(negative, "")
        .replace(" ", "")
    )
    sign = -1 if negative in text else 1
    return float(number) * sign


def extract_coordinates(texts):
    """Return ``(latitude, longitude)`` from the first two texts containing "º".

    The first text with a degree sign is read as the latitude and the second
    as the longitude. Either value is ``None`` when it is not found, so a
    longitude that happens to be the very last text is still reported.

    Args:
        texts: Iterable of strings, in document order.

    Returns:
        A ``(latitude, longitude)`` tuple of floats or ``None``.
    """
    latitude = None
    longitude = None
    for text in texts:
        if "º" not in text:
            continue
        if latitude is None:
            latitude = parse_coordinate(text, "N", "S")
        else:
            longitude = parse_coordinate(text, "E", "W")
            break
    return latitude, longitude


def island_title(node):
    """Return the island name if *node* is a ``<b><font><font>`` title, else None.

    Empty ``<b>`` or ``<font>`` tags are treated as "not a title" instead of
    raising ``IndexError``.
    """
    if node.name != "b" or not node.contents:
        return None
    outer = node.contents[0]
    if outer.name != "font" or not outer.contents:
        return None
    inner = outer.contents[0]
    if inner.name != "font":
        return None
    return inner.text


def fetch_soup(address):
    """Download *address* and return it parsed with ``html.parser``.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-success HTTP status.
    """
    response = requests.get(address, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of scraping an HTTP error page as if it were data.
    response.raise_for_status()
    return bs(response.content, "html.parser")


def main():
    """Scrape the Pacific islands and write their coordinates to islands.csv."""
    # download the list of islands from http://islands.unep.ch/Tiocean.htm
    base = "http://islands.unep.ch/"
    url = "Tiocean.htm"
    soup = fetch_soup(base + url)

    # extract the names (and detail-page links) of the islands whose ocean
    # is "Pacific"
    islands = set()
    islands_links = set()
    for node in soup.body.contents:
        if node.text.startswith("Pacific"):
            # The anchor with the island name sits three siblings after the
            # ocean label in the index page.
            anchor = node.next_sibling.next_sibling.next_sibling.contents[0]
            print(anchor)
            name = anchor.text
            # Drop the "#fragment" so each detail page is fetched only once.
            link = anchor["href"].split("#")[0]
            islands.add(name)
            islands_links.add(link)
            print("Found {} - {}".format(name, link))

    # find the longitude and latitude of each island
    records = []
    # Sorted so that the row order of the CSV is the same on every run.
    for link in sorted(islands_links):
        # Be polite to the server: at most one request per second.
        time.sleep(1)
        print("Processing {}".format(link))
        soup = fetch_soup(base + link)
        # NOTE(review): the island entries are looked up inside <body> tags
        # nested within the document body, as the original code did;
        # presumably the pages are malformed this way - confirm.
        for section in soup.body.find_all("body"):
            for node in section.contents:
                elem_name = island_title(node)
                if elem_name is None:
                    continue
                print("found element {}".format(elem_name))
                if elem_name not in islands:
                    continue
                latitude, longitude = extract_coordinates(
                    sibling.text for sibling in node.next_siblings
                )
                if latitude is not None and longitude is not None:
                    records.append(
                        {
                            "ISLAND": elem_name,
                            "LATITUDE": latitude,
                            "LONGITUDE": longitude,
                        }
                    )
                print("{} - {} - {}".format(elem_name, longitude, latitude))

    # write the islands and their coordinates to a csv file
    # (built in one go: DataFrame.append was removed in pandas 2.0)
    df = pd.DataFrame(records, columns=["ISLAND", "LATITUDE", "LONGITUDE"])
    df.to_csv("islands.csv", index=False)


if __name__ == "__main__":
    main()