#!python
# -*- mode: python; Encoding: utf-8; coding: utf-8 -*-
# Last updated: <2025/04/03 20:39:21 +0900>
"""
HTML parse sample by BeautifulSoup4

Usage:
    python htmlparse01.py HTML_FILE [sjis|shift_jis|utf8|utf-8]

pip install beautifulsoup4

Windows10 x64 22H2 + Python 3.10.10 64bit
"""

import sys
import re

from bs4 import BeautifulSoup

usage_mes = "Usage: python htmlparse01.py HTML_FILE [sjis|shift_jis|utf8|utf-8]"

# Matches a CSS url(...) reference and captures its argument. Whitespace is
# tolerated between the parentheses and the (optionally quoted) value, and the
# function name is matched case-insensitively, so "URL( 'a.png' )" is found too.
_CSS_URL_RE = re.compile(r"url\(\s*['\"]?([^'\")]+)['\"]?\s*\)", re.IGNORECASE)


def extract_all_file_paths(html_file: str, charset: str):
    """Extract the file paths referenced by a local HTML file.

    Collects the href of every <a>, the src of every <img> and <script>,
    the href of every <link rel="stylesheet">, and every url(...) argument
    found inside <style> elements.

    Args:
        html_file: Path of the HTML file to read.
        charset: Encoding used to decode the file (e.g. "utf-8", "shift_jis").

    Returns:
        A 4-tuple of lists: (hrefs, image srcs, script srcs, stylesheet
        paths). If the file cannot be read or parsed, an error message is
        printed and four empty lists are returned.
    """
    try:
        with open(html_file, "r", encoding=charset) as f:
            html = f.read()

        soup = BeautifulSoup(html, "html.parser")

        hrefs = [a["href"] for a in soup.find_all("a", href=True)]
        image_srcs = [img["src"] for img in soup.find_all("img", src=True)]
        script_srcs = [
            script["src"] for script in soup.find_all("script", src=True)
        ]

        # External stylesheets referenced through <link rel="stylesheet">.
        stylesheet_paths = [
            link["href"]
            for link in soup.find_all("link", rel="stylesheet", href=True)
        ]

        # Files referenced through url(...) inside inline <style> elements.
        for style in soup.find_all("style"):
            for raw in _CSS_URL_RE.findall(style.text):
                # An unquoted value may carry padding blanks; drop them and
                # skip references that turn out to be empty.
                path = raw.strip()
                if path:
                    stylesheet_paths.append(path)

        return hrefs, image_srcs, script_srcs, stylesheet_paths

    except FileNotFoundError:
        print(f"Error : Not found {html_file}")
        return [], [], [], []
    except Exception as e:
        # Best-effort tool: report any other failure (bad encoding name,
        # decode error, ...) and fall back to empty results.
        print(f"Error : {e}")
        return [], [], [], []


def _print_section(title, paths):
    """Print a heading line followed by the unique paths in sorted order."""
    print(title)
    for path in sorted(set(paths)):
        print(path)


def main():
    """Parse the command line, extract the paths and print them by category.

    Expects the HTML file path as the first argument and an optional charset
    as the second (default "utf-8"). Prints the usage message and exits with
    status 1 for any other argument count.
    """
    if len(sys.argv) == 2:
        html_file = sys.argv[1]
        charset = "utf-8"
    elif len(sys.argv) == 3:
        html_file = sys.argv[1]
        charset = sys.argv[2]
    else:
        print(usage_mes)
        sys.exit(1)

    # Map the short spellings advertised in the usage text to canonical names.
    if charset == "sjis":
        charset = "shift_jis"
    elif charset == "utf8":
        charset = "utf-8"

    hrefs, img_srcs, script_srcs, css_paths = extract_all_file_paths(
        html_file, charset
    )

    _print_section("# href=", hrefs)
    _print_section("# img src=", img_srcs)
    _print_section("# script src=", script_srcs)
    _print_section("# style src=", css_paths)


if __name__ == "__main__":
    main()