# 先删除所有 style 和 script 标签 for tag in soup.find_all(['style', 'script']): tag.decompose()
# 逆序遍历所有标签,确保先处理子标签 for tag in soup.find_all(True)[::-1]: # 如果标签在保留列表中,则不处理 if tag.name in keep_tags: continue # 如果标签内没有显示内容(空白或空字符串),则删除它 ifnot tag.get_text(strip=True): tag.decompose()
if __name__ == '__main__':
    # Script entry point: read a saved HTML page, prune it, and write the
    # cleaned result to a second file.

    # Load the raw HTML text from disk.
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used for both files -- confirm this matches how serp_s.txt was saved.
    with open('serp_s.txt', 'r') as f:
        html = f.read()

    # Parse the HTML with the built-in 'html.parser' backend.
    soup = BeautifulSoup(html, 'html.parser')

    # Prune the parsed tree; the return value is not used, and the same
    # `soup` object is serialised below.
    remove_empty_tags(soup)

    # Write the processed HTML, pretty-printed, to the output file.
    with open('serp_s_2.txt', 'w') as f:
        f.write(soup.prettify())