# Note: oceanwp - remove the cart price
# Note: remove HOME
# -*- coding:gbk -*-
import requests
from bs4 import BeautifulSoup
import time
import io
import sys
import re
#sys.stdout = io.TextIOWrapper(sys.stdout,encoding='gb18030') #改变标准输出的默认编码
#print(k)
#kc=str(k.contents).replace(u'\xa0',u' ').replace(u'\u203a',u' ')
#print('\n'.join(['%s'%c for c in b.select('ol ol a')]))
##kc=str(k.contents).encode('gb18030').decode('gbk','ignore')
## tat=tat+kc[1:len(kc)-1]
#print(tat,file=ff) #,file=ff
# Scrape one WeChat article page: list every sufficiently long link in an
# index text file and save each linked page as "<sanitized title>.html".

PAGE_URL = 'https://mp.weixin.qq.com/s/WUrnjh_sz91kMQZYcTAFXg'
INDEX_FILE = '外土司_微信文章.txt'
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests may block forever

# Characters removed from a link title before it is used as a file name:
# whitespace, ASCII and CJK punctuation, and the characters that are not
# allowed in file names on Windows (\ / : * ? " < > |).
_FILENAME_JUNK = re.compile(
    r"""[\s+.!/_,$%^*()"'\\:<>|?~@#&—…¥“”!,。?、()【】]+"""
)


def gbk_safe(text):
    """Return *text* with every character that GBK cannot encode removed.

    Encoding with 'ignore' drops the unsupported characters outright.
    The previous gb18030 -> gbk round trip instead left stray ASCII digits
    behind (the trail bytes of 4-byte GB18030 sequences), e.g. a
    non-breaking space turned into "02".
    """
    return text.encode('gbk', 'ignore').decode('gbk')


def title_to_filename(title):
    """Build the ``.html`` file name used to store the page behind *title*.

    Punctuation, whitespace and file-name-reserved characters are stripped.
    If nothing is left, ``untitled`` is used so the result is never the
    bare extension.
    """
    stem = _FILENAME_JUNK.sub('', title)
    return (stem or 'untitled') + '.html'


def main():
    """Fetch PAGE_URL, write the link index and download every linked page.

    Each link whose stripped text is longer than 5 characters is written to
    INDEX_FILE and echoed to stdout. Links with an absolute http(s) href are
    then downloaded; a failed download is reported and skipped so that the
    remaining links are still processed.
    """
    page = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')

    # The index is opened with the platform default encoding, as before;
    # the titles are GBK-filtered so they encode under GBK and UTF-8 alike.
    with open(INDEX_FILE, 'w') as index:
        for link in soup.find_all('a'):
            title = gbk_safe(link.get_text())
            href = link.get('href')
            if len(title.strip()) <= 5:
                continue

            entry = "title: %s , href: %s" % (title, href)
            print(entry, file=index)
            print(entry)

            # Anchors without an href, or with a relative / javascript:
            # target, cannot be fetched; they stay in the index only.
            if not href or not href.startswith(('http://', 'https://')):
                continue

            filename = title_to_filename(title)
            print(filename + ' , ' + href)
            try:
                response = requests.get(href, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as exc:
                print('download failed: %s (%s)' % (href, exc))
                continue
            with open(filename, 'wb') as out:
                out.write(response.content)


if __name__ == '__main__':
    main()
"""
hh=[h.get('href') for h in b.select('ol ol a')]
tt=[t['title'] for t in b.select('ol ol a')]
w3c='https://www.w3cschool.cn'
with open('外土司_微信文章.txt','w') as ff:
tat=''
for h in hh:
hlink=w3c+h
subb=BeautifulSoup(requests.get(hlink).text,'lxml')
print(hlink)
cc=subb.find(id='pro-mian-header').prettify() #从bs4.element.Tag转到list
dd=subb.find('div','content-bg').prettify() #从bs4.element.Tag转到list
tt=cc+dd
print(tt.encode('gb18030').decode('gbk','ignore'),file=ff)
print(''.join([ '-' for i in range(180)]))
time.sleep(5)
"""