用 Python 快速过滤 HTML 指定标签的函数

"""
@author: MR.N
@created: 2022/3/30 Wed.
@version: 1.0
"""
 
import io
import re
 
 
# Tags whose markup is stripped while the text between them is kept.
_INLINE_TAGS = (
    'div', 'ul', 'li', 'ol', 'p', 'span', 'form', 'br',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'input',
    'title', 'table', 'tbody', 'a',
    'i', 'strong', 'b', 'big', 'small', 'u', 's', 'strike',
    'img', 'center', 'dl', 'dt', 'font', 'em',
    'code', 'pre', 'link', 'meta', 'iframe', 'ins',
)
# Elements that are removed together with everything they enclose.
_BLOCK_TAGS = ('script', 'style')
# Table row/cell tags; a line containing one of them is flattened.
_TABLE_TAGS = ('tr', 'th', 'td')


def _tag_pattern(tags):
    """Compile a pattern for an opening or closing tag named exactly in *tags*."""
    names = '|'.join(re.escape(tag) for tag in tags)
    # The lookahead requires the name to end here, so `a` does not match
    # `<article>` or `<a-custom>` and `s` does not match `<script>`.
    return re.compile(rf'</?(?:{names})(?=[\s/>])[^<>]*>', re.I)


_INLINE_RE = _tag_pattern(_INLINE_TAGS)
_TABLE_RE = _tag_pattern(_TABLE_TAGS)
_BLOCK_RE = re.compile(
    r'<\s*(?P<tag>' + '|'.join(_BLOCK_TAGS) + r')(?=[\s/>])[^>]*>'
    r'[\S\s]*?<\s*/\s*(?P=tag)\s*>',
    re.I,
)


def filter_html_tags(text):
    """Strip a fixed set of HTML tags from *text* and return the result.

    Processing happens in four steps:

    1. ``<script>`` and ``<style>`` elements are removed together with
       their content.
    2. Opening and closing tags listed in ``_INLINE_TAGS`` are removed
       (attributes included); the text between them is kept.
    3. Every line that contains a ``<tr>``, ``<th>`` or ``<td>`` tag has
       those tags, its trailing newline and ALL of its space characters
       removed, so a table row spread over several lines is joined into
       one run of text.
    4. Runs of consecutive newlines are collapsed into a single newline.

    Tag names are matched exactly and case-insensitively. Tags that are
    not in one of the lists above are left untouched.

    :param text: HTML source as a ``str``.
    :return: the filtered text.
    """
    # Whole blocks go first so that markup-looking text inside a script
    # or style sheet is never treated as a tag by the later steps.
    text = _BLOCK_RE.sub('', text)
    text = _INLINE_RE.sub('', text)

    pieces = []
    # io.StringIO splits on '\n' only, unlike str.splitlines().
    for line in io.StringIO(text):
        if _TABLE_RE.search(line):
            line = _TABLE_RE.sub('', line.replace('\n', ''))
            # Drops every space on the line, not only repeated ones.
            line = line.replace(' ', '')
        pieces.append(line)

    # Collapse multiple empty lines.
    return re.sub(r'\n{2,}', '\n', ''.join(pieces))

(本文内容根据网络资料整理或来自用户投稿,出于传递更多信息之目的,不代表本站的观点和立场,本站也不对其真实性、可靠性承担任何法律责任,特此声明!)

点赞(0) 打赏

评论列表 共有 0 条评论

暂无评论

微信小程序

微信扫一扫体验

立即
投稿

微信公众账号

微信扫一扫加关注

发表
评论
返回
顶部