main.py
import os
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Pages already scraped, used to avoid revisiting (and infinite recursion)
visited_urls = set()
def create_directory_for_file(file_path):
    """Create the directory for the file if it doesn't exist."""
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
def download_file(session, url, directory, relative_path):
    """Download a file and save it under `directory` at the given relative path."""
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        # Construct the full file path and create any missing directories
        file_path = os.path.join(directory, relative_path)
        create_directory_for_file(file_path)
        # Save the file
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {relative_path}")
        return True
    except Exception as e:
        print(f"Failed to download {url}. Error: {e}")
        return False
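# Example of the URL-to-path mapping this performs (a sketch with
# hypothetical values, not taken from a real run):
#   download_file(session, 'https://example.com/js/app.js',
#                 'website_content', 'js/app.js')
#   writes the response body to website_content/js/app.js.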
def extract_urls_from_css(css_content):
    """Extract all url(...) references from CSS content."""
    url_pattern = re.compile(r'url\(["\']?(.*?)["\']?\)')
    return url_pattern.findall(css_content)
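# Example with a hypothetical CSS snippet:
#   extract_urls_from_css('body { background: url("img/bg.png"); }')
#   -> ['img/bg.png']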
def download_and_store_css(session, css_url, base_directory):
    """Download a CSS file, then download the assets it references."""
    css_parsed_url = urlparse(css_url)
    css_relative_path = css_parsed_url.path.lstrip('/')
    css_file_path = os.path.join(base_directory, css_relative_path)
    if download_file(session, css_url, base_directory, css_relative_path):
        # Load the downloaded CSS file to download referenced assets
        with open(css_file_path, 'r', encoding='utf-8') as file:
            css_content = file.read()
        # Extract URLs from the CSS and download each asset
        urls_in_css = extract_urls_from_css(css_content)
        for asset_url in urls_in_css:
            asset_url_clean = asset_url.strip('\'"')
            if asset_url_clean.startswith('data:'):  # Skip inline data: URIs (e.g., base64 images)
                continue
            asset_full_url = urljoin(css_url, asset_url_clean)
            asset_parsed_url = urlparse(asset_full_url)
            asset_relative_path = asset_parsed_url.path.lstrip('/')
            download_file(session, asset_full_url, base_directory, asset_relative_path)
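# Relative asset URLs resolve against the stylesheet's own URL, e.g.
# (hypothetical values):
#   urljoin('https://example.com/css/site.css', '../fonts/a.woff2')
#   -> 'https://example.com/fonts/a.woff2'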
def extract_background_image_url(style_content):
    """Extract the URL from a background-image style declaration."""
    url_pattern = re.compile(r'url\(["\']?(.*?)["\']?\)')
    match = url_pattern.search(style_content)
    return match.group(1) if match else None
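# Example with a hypothetical inline style:
#   extract_background_image_url("background-image: url('img/hero.jpg')")
#   -> 'img/hero.jpg'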
def clean_url(url):
    """Remove the site-specific 'foores/' path prefix from a URL if present."""
    return url.replace('foores/', '')
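# Example (the 'foores/' prefix is specific to the scraped site):
#   clean_url('foores/img/logo.png') -> 'img/logo.png'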
def download_images(session, tag, attr, html_url, base_directory):
    """Download an image referenced by src, data-src, or a similar attribute."""
    img_url = tag.get(attr)
    if img_url and not img_url.startswith('data:'):  # Skip inline data: URIs
        full_url = urljoin(html_url, img_url)
        relative_path = clean_url(urlparse(full_url).path.lstrip('/'))
        if download_file(session, full_url, base_directory, relative_path):
            # Update the tag's attribute to point to the local file
            tag[attr] = relative_path
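# Example rewrite this performs (hypothetical markup):
#   <img src="/foores/img/logo.png"> is saved to
#   <base_directory>/img/logo.png and becomes <img src="img/logo.png">.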
def scrape_html(session, html_url, base_directory):
    """Scrape an HTML page, download its assets, and recurse into linked pages."""
    html_parsed_url = urlparse(html_url)
    html_relative_path = html_parsed_url.path.lstrip('/')
    if not html_relative_path:
        html_relative_path = 'index.html'
    html_file_path = os.path.join(base_directory, html_relative_path)
    # Ensure we don't visit the same page multiple times
    if html_url in visited_urls:
        return
    visited_urls.add(html_url)
    try:
        response = session.get(html_url, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to retrieve the webpage {html_url}. Error: {e}")
        return
    # Ensure the target directory exists
    create_directory_for_file(html_file_path)
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # ---------- Download CSS Files ----------
    css_files = soup.find_all('link', rel='stylesheet')
    for css in css_files:
        css_href = css.get('href')
        if css_href:
            css_url = urljoin(html_url, css_href)
            download_and_store_css(session, css_url, base_directory)
            # Keep the original (relative) href so the saved page still resolves
            css['href'] = css_href

    # ---------- Download JS Files ----------
    js_files = soup.find_all('script', src=True)
    for js in js_files:
        js_src = js.get('src')
        if js_src:
            js_url = urljoin(html_url, js_src)
            js_relative_path = clean_url(urlparse(js_url).path.lstrip('/'))
            download_file(session, js_url, base_directory, js_relative_path)

    # ---------- Download Images from <img> Tags, Favicons, and Data Attributes ----------
    img_tags = soup.find_all('img') + soup.find_all('link', rel=lambda x: x and 'icon' in x)
    for img in img_tags:
        download_images(session, img, 'src', html_url, base_directory)
        download_images(session, img, 'data-src', html_url, base_directory)

    # Handle images referenced in data attributes (e.g., data-bg) and style attributes
    data_attrs = ['data-bg', 'data-original', 'data-background']
    for tag in soup.find_all():
        # Handle data-* attributes
        for attr in data_attrs:
            data_url = tag.get(attr)
            if data_url:
                # Extract the URL within the data attribute
                data_url_clean = extract_background_image_url(data_url)
                if data_url_clean:
                    data_full_url = urljoin(html_url, data_url_clean)
                    data_relative_path = clean_url(urlparse(data_full_url).path.lstrip('/'))
                    if download_file(session, data_full_url, base_directory, data_relative_path):
                        # Update the data attribute to point to the local file
                        tag[attr] = f"url('{data_relative_path}')"
        # Handle style attributes
        style_content = tag.get('style')
        if style_content:
            style_url = extract_background_image_url(style_content)
            if style_url:
                style_full_url = urljoin(html_url, style_url)
                style_relative_path = clean_url(urlparse(style_full_url).path.lstrip('/'))
                if download_file(session, style_full_url, base_directory, style_relative_path):
                    # Update the style attribute to point to the local file
                    tag['style'] = style_content.replace(style_url, style_relative_path)

    # ---------- Find and Download Other Linked HTML Files ----------
    html_links = soup.find_all('a', href=True)
    for link in html_links:
        link_href = link['href']
        if link_href.endswith('.html') or link_href.endswith('/'):
            # Construct the full URL and recurse, staying on the same host
            # so the crawl doesn't wander off-site
            linked_html_url = urljoin(html_url, link_href)
            if urlparse(linked_html_url).netloc == html_parsed_url.netloc:
                scrape_html(session, linked_html_url, base_directory)
            # Keep the original (relative) href so the saved page still resolves
            link['href'] = link_href

    # ---------- Save the Updated HTML ----------
    with open(html_file_path, 'w', encoding='utf-8') as file:
        file.write(soup.prettify(formatter=None))
    print(f"Saved HTML to {html_file_path}")
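# Crawl order sketch (hypothetical site): starting at '/', the recursion
# follows .html and trailing-slash links such as '/about.html', then links
# found on those pages; visited_urls ensures each page is saved only once.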
def scrape_website(url, base_directory):
    """Scrape the website and mirror its folder structure locally."""
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0'})
    print(f"Starting to scrape website: {url}")
    scrape_html(session, url, base_directory)
    print("Scraping completed.")
# Load the URL and target directory from environment variables, falling back
# to placeholder defaults if they are not set
if __name__ == '__main__':
    url = os.getenv('SCRAPE_URL', 'https://your-website.com')
    base_directory = os.getenv('SCRAPE_DIR', 'website_content')
    scrape_website(url, base_directory)
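# A matching .env file would look like this (values are illustrative):
#   SCRAPE_URL=https://example.com
#   SCRAPE_DIR=website_content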