This approach works on both Linux and Windows.
Install Python3.11
Update: It can also run normally on Python3.6.
cd /root wget https://www.python.org/ftp/python/3.11.0/Python-3.11.0.tgz tar -xzf Python-3.11.0.tgz yum -y install gcc zlib zlib-devel libffi libffi-devel yum install readline-devel yum install openssl-devel openssl11 openssl11-devel export CFLAGS=$(pkg-config --cflags openssl11) export LDFLAGS=$(pkg-config --libs openssl11) cd /root/Python-3.11.0 ./configure --prefix=/usr/python --with-ssl make make install ln -s /usr/python/bin/python3 /usr/bin/python3 ln -s /usr/python/bin/pip3 /usr/bin/pip3
Runtime dependencies
python3.11 Firefox latest Firefox driver 0.33
pip3 install requests beautifulsoup4 selenium pillow urllib3 -i https://pypi.tuna.tsinghua.edu.cn/simple
(Note: argparse is part of the Python standard library since 3.2 and must not be installed from PyPI — the PyPI package is an outdated backport.)
CentOS 7 ships with an old Firefox browser; uninstall it first, then install the current one:
sudo yum remove firefox
sudo yum install firefox
Then go to: Firefox download link and select geckodriver-v0.33.0-linux64.tar.gz
tar -zxvf geckodriver-v0.33.0-linux64.tar.gz mv geckodriver /usr/bin pip3 install selenium pip3 install pillow
Clone web page
import os
import requests
import base64
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import urljoin, urlparse
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import argparse
from os.path import dirname, abspath

# Disable the warnings emitted because we fetch resources with verify=False.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


def create_local_path(base_url, folder, resource_url):
    """Map a resource URL to a local file path under *folder*.

    The original site's directory structure is mirrored beneath *folder*.
    ``data:`` URIs get a synthetic, timestamp-based filename instead.

    Args:
        base_url: The page URL used to resolve relative resource URLs.
        folder: Root output directory for the cloned site.
        resource_url: The resource's URL (absolute, relative, or a data URI).

    Returns:
        The local filesystem path where the resource should be stored.
        Intermediate directories are created as a side effect.
    """
    # data: URIs carry their payload inline; derive an extension from the
    # MIME type and invent a unique filename (millisecond timestamp).
    if resource_url.startswith('data:'):
        mime_info, _ = resource_url.split(',', 1)
        mime_type = mime_info.split(';')[0].split(':')[1]
        ext = mime_type.split('/')[1] if '/' in mime_type else 'png'
        filename = f"datauri_{int(time.time() * 1000)}.{ext}"
        return os.path.join(folder, filename)

    # Resolve against the page URL, then mirror the URL path locally.
    parsed_url = urlparse(urljoin(base_url, resource_url))
    path_segments = parsed_url.path.lstrip('/').split('/')
    filename = path_segments[-1]
    # Fix: URLs ending in '/' produce an empty filename, which would make
    # open() target a directory — fall back to a conventional index name.
    if not filename:
        filename = 'index.html'
    local_dir = os.path.join(folder, *path_segments[:-1])
    os.makedirs(local_dir, exist_ok=True)
    return os.path.join(local_dir, filename)


def download_resource(base_url, folder, resource_url, retries=2):
    """Download a resource to its mirrored local path, retrying on failure.

    Already-downloaded files (path exists) are skipped. ``data:`` URIs are
    base64-decoded and written directly; regular URLs are streamed with
    certificate verification disabled (the caller deliberately clones
    arbitrary sites).

    Args:
        base_url: Page URL used to resolve relative resource URLs.
        folder: Root output directory for the cloned site.
        resource_url: URL or data URI of the resource.
        retries: Maximum number of download attempts.

    Returns:
        The local path for the resource (even if all attempts failed).
    """
    local_path = create_local_path(base_url, folder, resource_url)
    if not os.path.exists(local_path):
        attempt = 0
        while attempt < retries:
            try:
                if resource_url.startswith('data:'):
                    # Inline payload: decode and write in one shot.
                    header, encoded = resource_url.split(',', 1)
                    data = base64.b64decode(encoded)
                    with open(local_path, 'wb') as f:
                        f.write(data)
                    break  # Success.
                else:
                    # Regular URL: stream to disk in chunks.
                    response = requests.get(urljoin(base_url, resource_url),
                                            stream=True, verify=False)
                    if response.status_code == 200:
                        with open(local_path, 'wb') as f:
                            for chunk in response.iter_content(chunk_size=8192):
                                f.write(chunk)
                        break  # Success.
                    else:
                        print(f"Attempt {attempt + 1}: Error downloading {resource_url}: Status code {response.status_code}")
                        attempt += 1
                        time.sleep(2)  # Back off briefly before retrying.
            except Exception as e:
                print(f"Attempt {attempt + 1}: An error occurred while downloading {resource_url}: {e}")
                attempt += 1
                time.sleep(2)  # Back off briefly before retrying.
        if attempt == retries:
            print(f"Failed to download {resource_url} after {retries} attempts. Skipping...")
    return local_path


def update_links(soup, tag, attribute, folder, base_url):
    """Download every <tag attribute=...> resource and rewrite the attribute
    to the resource's path relative to *folder* so the clone works offline.
    """
    for element in soup.find_all(tag, {attribute: True}):
        original_url = element[attribute]
        local_path = download_resource(base_url, folder, original_url)
        element[attribute] = os.path.relpath(local_path, folder)


def clone_website(url, output_folder_name):
    """Clone *url* (HTML plus linked CSS/JS/images/documents) for offline use.

    Renders the page with headless Firefox so JavaScript-inserted content is
    captured, rewrites resource links to local relative paths, and writes the
    result as index.html inside *output_folder_name* (created next to this
    script).
    """
    # Resolve the output directory next to this script and ensure it exists.
    script_dir = dirname(abspath(__file__))
    output_folder = os.path.join(script_dir, output_folder_name)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Headless Firefox via Selenium renders the fully-loaded DOM.
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)

    # Fix: soup must exist before the try block — if driver.get() raised,
    # the file write below previously hit a NameError.
    soup = None
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Mirror stylesheets, scripts and images, rewriting their links.
        update_links(soup, 'link', 'href', output_folder, url)
        update_links(soup, 'script', 'src', output_folder, url)
        update_links(soup, 'img', 'src', output_folder, url)

        # Only hyperlinks that point at downloadable documents are fetched;
        # ordinary page links are left pointing at the live site.
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if any(href.endswith(ext) for ext in
                   ['.pdf', '.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls']):
                download_resource(url, output_folder, href)
    except Exception as e:
        print(f"An error occurred while processing the page: {e}")
    finally:
        driver.quit()

    # Write the rewritten HTML only if the page was actually parsed.
    if soup is not None:
        with open(os.path.join(output_folder, 'index.html'), 'w', encoding='utf-8') as file:
            file.write(str(soup))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Clone a website for offline viewing.')
    parser.add_argument('target_url', help='The URL of the website to clone.')
    args = parser.parse_args()
    # The output folder name is fixed; change here if a different name is wanted.
    output_folder_name = "tempUrlCopyFolder"
    clone_website(args.target_url, output_folder_name)
Take a long screenshot of the web page
# Usage: python3 script.py <url>
from selenium import webdriver
from PIL import Image
import io
import sys  # Fix: original read "importsys" (missing space) — a SyntaxError.
import os
import time
import urllib3
from urllib3.exceptions import InsecureRequestWarning

urllib3.disable_warnings(InsecureRequestWarning)


def capture_full_page_screenshot(url, output_file):
    """Capture a full-height screenshot of *url* and save it to *output_file*.

    Headless Firefox scrolls through the page one viewport at a time,
    grabbing a screenshot per step (with a 10% vertical overlap so no
    seam is lost), then the slices are pasted into a single tall image.

    Args:
        url: The page to capture.
        output_file: Path where the stitched PNG is written.
    """
    # Headless Firefox with a fixed viewport so slice geometry is known.
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)
    window_width = 1920
    window_height = 1080
    driver.set_window_size(window_width, window_height)
    try:
        driver.get(url)
        time.sleep(5)  # Give the page time to load content.

        # Total document height drives how far we scroll.
        total_height = driver.execute_script(
            "return document.body.parentNode.scrollHeight")

        # Collect one screenshot per scroll position.
        slices = []
        offset = 0
        while offset < total_height:
            driver.execute_script(f"window.scrollTo(0, {offset});")
            time.sleep(2)  # Wait for lazy-loaded content to settle.
            png = driver.get_screenshot_as_png()
            slices.append(Image.open(io.BytesIO(png)))
            # Fix: original read "offset + =" (invalid syntax).
            offset += window_height
            if offset < total_height:
                # Overlap consecutive shots by 10% of a viewport so the
                # stitch below never leaves a gap between slices.
                offset -= (window_height // 10)

        # Fix: a zero-height page yields no slices; slices[-1] below would
        # raise IndexError. Report and bail instead.
        if not slices:
            print('An error occurred: page reported zero height, nothing captured')
            return

        # Stitch: paste all but the last slice at their overlap-adjusted
        # offsets, then anchor the last slice to the bottom of the canvas.
        screenshot = Image.new('RGB', (window_width, total_height))
        offset = 0
        for img in slices[:-1]:
            screenshot.paste(img, (0, offset))
            offset += (window_height - (window_height // 10))  # Subtract the overlap.
        last_img = slices[-1]
        screenshot.paste(last_img, (0, total_height - last_img.size[1]))

        screenshot.save(output_file)
        print('Image saved successfully')
    except Exception as e:
        print(f'An error occurred: {e}')
    finally:
        driver.quit()


if __name__ == "__main__":
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        print('Usage: python screenshot.py <URL>')
        sys.exit(1)
    # Save the output image next to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_file = os.path.join(script_dir, 'tempUrlImg.png')
    capture_full_page_screenshot(url, output_file)
The effect is as shown below:
Using format: python3 xxx.py url
The knowledge points of the article match the official knowledge files, and you can further learn relevant knowledge. Python entry skill treeHomepageOverview 386,779 people are learning the system