【实例】python bs4 beautifulsoup + urllib.request 提取网址
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> import re
>>> url = 'http://www1.szu.edu.cn/board/'
>>> page=requests.get(url).text
>>> pagesoup=BeautifulSoup(page,'lxml')
>>> for link in pagesoup.find_all(name='a',attrs={"href":re.compile(r'^http:')}):
...     print(link.get('href'))
...
http://www.szu.edu.cn
http://news.szu.edu.cn
http://210.39.3.155:9090/goLogin.do
http://www.szu.edu.cn/yxjg/xyxb.htm
http://www.szu.edu.cn/yxjg/znbm.htm
http://www.miibeian.gov.cn
>>>
--------------------------------
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> print(soup.get_text())
---------------------
>>> # -*- coding: utf-8 -*-
...
>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> print(soup.get_text())
-----------------------------------
>>> webdata = requests.get(url)
>>> webdata.encoding = 'GBK'
>>> print(webdata.text)
----------------------------
>>> print(html)
<Response [200]>
>>> print(html.text)
html_doc = html.text
>>> html_doc = html.text
>>> print(html_doc)
--------------------------------------------
>>> soup = BeautifulSoup(html_doc, "html.parser")
>>> didi = soup.b.next_element.strip()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: 'NoneType' object is not callable
>>> didi = soup.tr.next_element.strip()
>>> print(didi)
>>> didi = soup.br.next_element.strip()
>>> print(didi)
----------------------------------------------
>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> html.encoding='GBK'  # NOTE(review): set encoding BEFORE reading html.text — the soup built above already decoded with the guessed charset
>>> html = html.text
>>> bs_obj = BeautifulSoup(html, 'lxml')
>>> bs_a_tag = bs_obj.findAll('a')
>>> print(bs_a_tag[0].text)
深大官网
>>>
----------------------------------------------
>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> html.encoding='GBK'  # NOTE(review): set encoding BEFORE reading html.text — the soup built above already decoded with the guessed charset
>>> html = html.text
>>> bs_obj = BeautifulSoup(html, 'lxml')
>>> bs_a_tag = bs_obj.findAll('a')
>>> print(bs_a_tag[0].text)
深大官网
>>> res_tr = r'<tr>(.*?)</tr>'
>>> m_tr = re.findall(res_tr,html,re.S|re.M)
>>> for line in m_tr:
... print(line)
... res_th = r'<th>(.*?)</th>'
... m_th = re.findall(res_th,line,re.S|re.M)
... for mm in m_th:
... print(mm)
... res_td = r'<td>(.*?)</td>'
... m_td = re.findall(res_td,line,re.S|re.M)
... for nn in m_td:
... print(nn)
...
<td align="center" valign="top" height="50"><script language="JavaScript">
function getstr(str)
{
var tag=str.indexOf("?");
////// if(tag>0 && escape(str).indexOf("%u")>0)
if(tag>0 && /.*[\u4e00-\u9fa5]+.*$/.test(str))
return str.substring(0,tag);
else
return str;
}
</script>
<table border="0" cellspacing="0" cellpadding="0" width="980">
<tr><td height=20 colspan="2">
<table border="0" cellspacing="0" width=980 cellpadding="0">
<tr>
<td height=25></td>
<td width="600" style="font-size: 9pt" align=right valign=bottom><b><a href="http://www.szu.edu.cn"><font color="#FFFFFF">深大官网</font></a> <a href="/2014/en"><font color="#FFFFFF">English</font></a></b> </td>
<td height=1 colspan="2" bgcolor="#EEEEEE"></td>
<td colspan="2" class=tbcolor2>
<table border="0" width="100%" cellspacing="0" cellpadding="0">
<tr>
<td class="fontcolor1" style="font-size: 9pt">
<img border="0" src="/images/who.png"> <a href="#" onclick="window.location.href='/manage/caslogin.asp?rurl='+getstr(window.location.href);return false;" class=fontcolor1>您未登录|点击统一身份认证</a>
----------------上面是关于 td tr th 的信息提取--------------------
>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> html.encoding='GBK'  # NOTE(review): set encoding BEFORE reading html.text — the soup built above already decoded with the guessed charset
>>> html = html.text
>>> bs_obj = BeautifulSoup(html, 'lxml')
>>> bs_a_tag = bs_obj.findAll('a')
>>> print(bs_a_tag[0].text)
深大官网
>>> res_tr = r'<tr>(.*?)</tr>'
>>> m_tr = re.findall(res_tr,html,re.S|re.M)
>>> for line in m_tr:
... print(line)
... res_th = r'<th>(.*?)</th>'
... m_th = re.findall(res_th,line,re.S|re.M)
... for mm in m_th:
... print(mm)
... res_td = r'<td>(.*?)</td>'
... m_td = re.findall(res_td,line,re.S|re.M)
... for nn in m_td:
... print(nn)
...
--------------下面是获取的超链接文本-----------------------------
>>> res = r'<a .*?>(.*?)</a>'
>>> mm = re.findall(res,html,re.S|re.M)
>>> for value in mm:
... print(value)
-------------------
突然在想深度的问题 :参考文章:http://blog.csdn.net/u012063507/article/details/72831751