爬取网页时调用tostring()中文乱码解决方案
- 出现乱码的代码
import requests
import re
from lxml import etree
# Demonstrates the problem: print the text of the first <td> of every row in
# the first "table-box" table. etree.tostring() is called without an explicit
# encoding, so lxml serialises to ASCII and every non-ASCII character (such as
# Chinese text) is written as a numeric character reference like "&#20013;".
# The printed text therefore looks garbled.
with open('real_case.html', 'r', encoding='utf-8') as f:
    c = f.read()

tree = etree.HTML(c)
table_element = tree.xpath("//div[@class='table-box'][1]/table/tbody/tr")

# Non-greedy pattern that strips markup such as "<td ...>" and "</td>".
pattern1_attrib = re.compile(r"<.*?>")

for row in table_element:
    cells = row.xpath('td')
    if not cells:
        # A row without any <td> has nothing to print. Skip it explicitly
        # rather than hiding every possible error behind a blanket
        # `except Exception: pass`.
        continue
    td1 = cells[0]
    # The garbling happens here: the serialised bytes are pure ASCII and
    # contain "&#NNNNN;" references instead of the original characters.
    s1 = etree.tostring(td1).decode('utf-8')
    s1 = pattern1_attrib.sub('', s1)
    print(s1)
乱码(tostring() 默认按 ASCII 编码序列化,中文字符会被输出成 &#20013; 这样的数字字符引用):
- 修正过后的代码
引入 Python 标准库的 html 模块,使用 html.unescape() 方法把字符引用还原成对应的 Unicode 字符
import requests
import re
from lxml import etree
#引入HTML包
import html
# Corrected version: print the text of the first <td> of every row in the
# first "table-box" table. etree.tostring() emits non-ASCII characters as
# numeric character references (e.g. "&#20013;"), so html.unescape() is used
# to turn those references back into the real Unicode characters.
with open('real_case.html', 'r', encoding='utf-8') as f:
    c = f.read()

tree = etree.HTML(c)
table_element = tree.xpath("//div[@class='table-box'][1]/table/tbody/tr")

# Non-greedy pattern that strips markup such as "<td ...>" and "</td>".
pattern1_attrib = re.compile(r"<.*?>")

for row in table_element:
    cells = row.xpath('td')
    if not cells:
        # A row without any <td> has nothing to print. Skip it explicitly
        # rather than hiding every possible error behind a blanket
        # `except Exception: pass`.
        continue
    td1 = cells[0]
    s1 = etree.tostring(td1).decode('utf-8')
    # Strip the tags first, then unescape. Unescaping first could turn an
    # escaped "&lt;...&gt;" in the cell text into something the tag pattern
    # would wrongly remove.
    s1 = pattern1_attrib.sub('', s1)
    # html.unescape() converts named and numeric character references to the
    # corresponding Unicode characters, following the HTML5 rules.
    s1 = html.unescape(s1)
    print(s1)
结果: