Cloning a web page and taking a full-page screenshot with Python

This works on both Linux and Windows. If you find it useful, please give it a like~

Install Python 3.11

Update: it also runs fine on Python 3.6.

cd /root
wget https://www.python.org/ftp/python/3.11.0/Python-3.11.0.tgz
tar -xzf Python-3.11.0.tgz
yum -y install gcc zlib zlib-devel libffi libffi-devel
yum -y install readline-devel
yum -y install openssl-devel openssl11 openssl11-devel
export CFLAGS=$(pkg-config --cflags openssl11)
export LDFLAGS=$(pkg-config --libs openssl11)
cd /root/Python-3.11.0
./configure --prefix=/usr/python  # the CFLAGS/LDFLAGS exported above point the build at OpenSSL 1.1
make
make install
ln -s /usr/python/bin/python3 /usr/bin/python3
ln -s /usr/python/bin/pip3 /usr/bin/pip3
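
After make install, it is worth confirming that the new interpreter actually picked up OpenSSL, since pip and any HTTPS download will fail without a working ssl module:

python3 -c "import ssl; print(ssl.OPENSSL_VERSION)"

If this prints an OpenSSL 1.1.x version string instead of raising an ImportError, the build is good.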

Runtime dependencies

Python 3.11, a current Firefox, and geckodriver 0.33.

pip3 install requests beautifulsoup4 selenium pillow urllib3 -i https://pypi.tuna.tsinghua.edu.cn/simple

argparse ships with the standard library, so it does not need to be installed separately.

CentOS 7 ships with an old Firefox; remove it first, then install the current package:

sudo yum remove -y firefox
sudo yum install -y firefox

Then download geckodriver from the geckodriver releases page (https://github.com/mozilla/geckodriver/releases) and pick geckodriver-v0.33.0-linux64.tar.gz:

tar -zxvf geckodriver-v0.33.0-linux64.tar.gz
mv geckodriver /usr/bin
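
You can confirm the driver is visible on the PATH:

geckodriver --version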
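Before running the full clone script below, a quick smoke test confirms that Selenium can drive headless Firefox through geckodriver (a minimal sketch; example.com is just a convenient test page):

# smoke_test.py - verify Selenium + headless Firefox + geckodriver work together
from selenium import webdriver

options = webdriver.FirefoxOptions()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
try:
    driver.get('https://example.com')
    print('Page title:', driver.title)  # should print "Example Domain"
finally:
    driver.quit()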

Clone web page

import os
import requests
import base64
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import urljoin, urlparse
import urllib3
from urllib3.exceptions import InsecureRequestWarning
import argparse
from os.path import dirname, abspath

# Disable the warnings emitted when SSL certificates are not verified
urllib3.disable_warnings(InsecureRequestWarning)

# Create a local directory structure and file path from the resource URL and base URL
def create_local_path(base_url, folder, resource_url):
    # If it is a data URI, return the path directly
    if resource_url.startswith('data:'):
        mime_info, _ = resource_url.split(',', 1)
        mime_type = mime_info.split(';')[0].split(':')[1]
        ext = mime_type.split('/')[1] if '/' in mime_type else 'png'
        filename = f"datauri_{int(time.time() * 1000)}.{ext}"
        local_path = os.path.join(folder, filename)
        return local_path

    # Parse the URL of the resource and create a folder structure
    parsed_url = urlparse(urljoin(base_url, resource_url))
    path_segments = parsed_url.path.lstrip('/').split('/')
    filename = path_segments[-1]
    local_dir = os.path.join(folder, *path_segments[:-1])
    os.makedirs(local_dir, exist_ok=True)
    local_path = os.path.join(local_dir, filename)
    return local_path

# Download resources, save them locally, and retain the directory structure of the original website
def download_resource(base_url, folder, resource_url, retries=2):
    local_path = create_local_path(base_url, folder, resource_url)
    if not os.path.exists(local_path):
        attempt = 0
        while attempt < retries:
            try:
                # If it is a data URI, decode the payload and save it
                if resource_url.startswith('data:'):
                    header, encoded = resource_url.split(',', 1)
                    data = base64.b64decode(encoded)  # assumes the data URI is base64-encoded
                    with open(local_path, 'wb') as f:
                        f.write(data)
                    break  # success, leave the retry loop
                else:
                    # Process common URL resources
                    response = requests.get(urljoin(base_url, resource_url), stream=True, verify=False)
                    if response.status_code == 200:
                        with open(local_path, 'wb') as f:
                            for chunk in response.iter_content(chunk_size=8192):
                                f.write(chunk)
                        break  # success, leave the retry loop
                    else:
                        print(f"Attempt {attempt + 1}: Error downloading {resource_url}: Status code {response.status_code}")
                        attempt += 1
                        time.sleep(2)  # Wait 2 seconds and retry
            except Exception as e:
                print(f"Attempt {attempt + 1}: An error occurred while downloading {resource_url}: {e}")
                attempt += 1
                time.sleep(2)  # Wait 2 seconds and retry

        if attempt == retries:
            print(f"Failed to download {resource_url} after {retries} attempts. Skipping...")
    return local_path

# Rewrite the links in the HTML to point to the local copies
def update_links(soup, tag, attribute, folder, base_url):
    for element in soup.find_all(tag, {attribute: True}):
        original_url = element[attribute]
        local_path = download_resource(base_url, folder, original_url)
        # Use forward slashes so the rewritten links also work when the script runs on Windows
        relative_path = os.path.relpath(local_path, folder).replace(os.sep, '/')
        element[attribute] = relative_path

def clone_website(url, output_folder_name):
    # Get the absolute path to the directory where the script is located
    script_dir = dirname(abspath(__file__))
    # Create the full path to the output directory
    output_folder = os.path.join(script_dir, output_folder_name)

    # Make sure the output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # Configure Selenium driver to use Firefox browser
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)

    soup = None
    try:
        # Open the target URL
        driver.get(url)
        # Get page source code
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Download and update links to all resources
        update_links(soup, 'link', 'href', output_folder, url)
        update_links(soup, 'script', 'src', output_folder, url)
        update_links(soup, 'img', 'src', output_folder, url)

        # Handle hyperlinks on the page; only links pointing to document files are downloaded
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if any(href.endswith(ext) for ext in ['.pdf', '.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls']):
                download_resource(url, output_folder, href)

    except Exception as e:
        print(f"An error occurred while processing the page: {e}")
    finally:
        # Close browser
        driver.quit()

    # Write the processed HTML to a file (skipped if the page never loaded)
    if soup is not None:
        with open(os.path.join(output_folder, 'index.html'), 'w', encoding='utf-8') as file:
            file.write(str(soup))

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Clone a website for offline viewing.')
    parser.add_argument('target_url', help='The URL of the website to clone.')
    
    args = parser.parse_args()
    
    # You can define the folder name as a variable, such as "tempUrlCopyFolder"
    output_folder_name = "tempUrlCopyFolder"
    
    clone_website(args.target_url, output_folder_name)
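
Save the script as, say, clone.py (the filename is up to you) and pass the target URL as the only argument. The mirror lands in tempUrlCopyFolder next to the script, with index.html at the top level and each asset stored under its original URL path:

python3 clone.py https://example.com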

Take a long screenshot of the web page

# Usage: python3 screenshot.py <url>

from selenium import webdriver
from PIL import Image
import io
import sys
import os  # used to locate the script's directory
import time
import urllib3
from urllib3.exceptions import InsecureRequestWarning

urllib3.disable_warnings(InsecureRequestWarning)

def capture_full_page_screenshot(url, output_file):
    # Initialize browser options
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')

    # Start Firefox browser
    driver = webdriver.Firefox(options=options)

    # Set browser window size
    window_width = 1920
    window_height = 1080
    driver.set_window_size(window_width, window_height)

    try:
        # Visit the website
        driver.get(url)
        time.sleep(5)  # Give the page time to load its content

        # Get the total height of the page
        total_height = driver.execute_script("return document.body.parentNode.scrollHeight")

        # Start taking screenshots
        slices = []
        offset = 0
        while offset < total_height:
            # Scroll to the next screenshot position
            driver.execute_script(f"window.scrollTo(0, {offset});")
            time.sleep(2)  # Wait for the page to settle

            # Get screenshot
            png = driver.get_screenshot_as_png()
            screenshot = Image.open(io.BytesIO(png))
            slices.append(screenshot)

            offset += window_height
            if offset < total_height:
                # Back up slightly so consecutive slices overlap instead of leaving gaps
                offset -= (window_height // 10)

        # Stitch the slices into one full-height image
        screenshot = Image.new('RGB', (window_width, total_height))
        offset = 0
        for img in slices[:-1]:
            screenshot.paste(img, (0, offset))
            offset += (window_height - (window_height // 10))  # advance by one screen minus the overlap
        # Paste the last slice aligned to the bottom of the page
        last_img = slices[-1]
        screenshot.paste(last_img, (0, total_height - last_img.size[1]))

        # Save the screenshot to the directory where the script is located
        screenshot.save(output_file)
        print('Image saved successfully')

    except Exception as e:
        print(f'An error occurred: {e}')

    finally:
        # Close browser
        driver.quit()

if __name__ == "__main__":
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        print('Usage: python screenshot.py <URL>')
        sys.exit(1)

    # Get the directory where the script is located
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Build the path to the output file
    output_file = os.path.join(script_dir, 'tempUrlImg.png')

    # Capture and save the screenshot
    capture_full_page_screenshot(url, output_file)
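
To see how the scroll and paste offsets line up, here is the loop's arithmetic worked through for the default 1080 px window and a hypothetical 3000 px page:

# Worked example of the slice offsets (the 3000 px page height is hypothetical)
window_height = 1080
total_height = 3000
offset = 0
positions = []
while offset < total_height:
    positions.append(offset)
    offset += window_height
    if offset < total_height:
        offset -= window_height // 10  # back up 108 px so slices overlap
print(positions)  # [0, 972, 1944] -> slices cover 0-1080, 972-2052, 1920-3000

The browser clamps the final scroll at total_height - window_height (1920 here), which is exactly where the stitching code pastes the last slice, so the seams match up.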

The stitched result is saved as tempUrlImg.png next to the script.

Usage: python3 screenshot.py <url>
