# Remove empty tags (tags with no content) from an HTML page.

from bs4 import BeautifulSoup


def remove_empty_tags(soup, keep_tags=None):
    """Strip content-less tags from a parsed HTML tree, in place.

    Two passes are made over ``soup``:

    1. Every ``style`` and ``script`` tag is removed unconditionally.
    2. Every remaining tag whose ``get_text(strip=True)`` is empty is
       removed, unless the tag itself is listed in ``keep_tags`` or it
       contains a descendant that is (e.g. a ``div`` wrapping an ``img``).

    Args:
        soup: Parsed document (e.g. a ``BeautifulSoup`` object) to modify.
        keep_tags: Names of tags to preserve even though they carry no
            text. Defaults to ``['img', 'input', 'br', 'hr']``.

    Returns:
        None. ``soup`` is mutated.
    """
    # Tags that usually render something despite having no inner text.
    if keep_tags is None:
        keep_tags = ['img', 'input', 'br', 'hr']

    # style/script never contribute visible content; drop them first so
    # their source text is not mistaken for page text below.
    for tag in soup.find_all(['style', 'script']):
        tag.decompose()

    # ids of tags known to hold content (text or a kept descendant).
    # Using id() is safe here: every tag stays referenced by the list being
    # iterated, so no id can be reused during the loop.
    has_content = set()

    # find_all() yields document order (parents before children); walking
    # it backwards guarantees each child is judged before its parent.
    for tag in reversed(soup.find_all(True)):
        if (
            tag.name in keep_tags
            or id(tag) in has_content
            or tag.get_text(strip=True)
        ):
            # Propagate upwards so a wrapper around a kept tag survives,
            # and so ancestors can skip the get_text() call entirely.
            if tag.parent is not None:
                has_content.add(id(tag.parent))
            continue
        tag.decompose()

if __name__ == '__main__':
    # Read the raw page. The encoding is given explicitly so the result does
    # not depend on the platform's default locale encoding.
    # NOTE(review): assumes serp_s.txt is UTF-8 encoded — confirm.
    with open('serp_s.txt', 'r', encoding='utf-8') as f:
        html = f.read()

    # Parse the HTML and prune empty tags in place.
    soup = BeautifulSoup(html, 'html.parser')
    remove_empty_tags(soup)

    # Write the cleaned, pretty-printed HTML.
    with open('serp_s_2.txt', 'w', encoding='utf-8') as f:
        f.write(soup.prettify())