【实例】python bs4 beautifulsoup + urllib.request 提取网址

>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> page=requests.get(url).text
>>> pagesoup=BeautifulSoup(page,'lxml')
>>> for link  in pagesoup.find_all(name='a',attrs={"href":re.compile(r'^http:')}):
...     print(link.get('href'))
...
http://www.szu.edu.cn
http://news.szu.edu.cn
http://210.39.3.155:9090/goLogin.do
http://www.szu.edu.cn/yxjg/xyxb.htm
http://www.szu.edu.cn/yxjg/znbm.htm
http://www.miibeian.gov.cn

>>>

--------------------------------

>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")

>>> print(soup.get_text())


---------------------

>>> # -*- coding: utf-8 -*-
...
>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")

>>> print(soup.get_text())

-----------------------------------

>>> webdata = requests.get(url)
>>> webdata.encoding = 'GBK'

>>> print(webdata.text)

----------------------------

【实例】python bs4 beautifulsoup + urllib.request 提取网址

>>> print(html)
<Response [200]>

>>> print(html.text)

html_doc = html.text

>>> html_doc = html.text

>>> print(html_doc)

--------------------------------------------

>>> soup = BeautifulSoup(html_doc, "html.parser")
>>> didi = soup.b.next_element.strip()
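Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: 'NoneType' object is not callable
# 注:这里 soup.b 的下一个元素很可能是 <a> 标签(Tag)而不是字符串;对 Tag 访问 .strip 会被当作查找名为 strip 的子标签,结果为 None,调用 None 就会报这个错。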
>>> didi = soup.tr.next_element.strip()
>>> print(didi)


>>> didi = soup.br.next_element.strip()

>>> print(didi)

----------------------------------------------

>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> html.encoding='GBK'
>>> html = html.text
>>> bs_obj = bs4.BeautifulSoup(html)
>>> bs_a_tag = bs_obj.findAll('a')
>>> print(bs_a_tag[0].text)
深大官网
>>>

----------------------------------------------

>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> html.encoding='GBK'
>>> html = html.text
>>> bs_obj = bs4.BeautifulSoup(html)
>>> bs_a_tag = bs_obj.findAll('a')
>>> print(bs_a_tag[0].text)
深大官网
>>> res_tr = r'<tr>(.*?)</tr>'
>>> m_tr =  re.findall(res_tr,html,re.S|re.M)


>>> for line in m_tr:

...   print(line)
...   res_th = r'<th>(.*?)</th>'
...   m_th = re.findall(res_th,line,re.S|re.M)
...   for mm in m_th:
...     print(unicode(mm,'utf-8'),)
...   res_td = r'<td>(.*?)</td>'
...   m_td = re.findall(res_td,line,re.S|re.M)
...   for nn in m_td:
...     print(unicode(nn,'utf-8'))
...


<td align="center" valign="top" height="50"><script language="JavaScript">
function getstr(str)
{
        var tag=str.indexOf("?");
//////  if(tag>0 && escape(str).indexOf("%u")>0)
        if(tag>0 && /.*[\u4e00-\u9fa5]+.*$/.test(str))
                return str.substring(0,tag);
        else
                return str;
}
</script>


<table border="0" cellspacing="0" cellpadding="0" width="980">
<tr><td height=20 colspan="2">


<table border="0" cellspacing="0" width=980 cellpadding="0">
<tr>
<td height=25></td>
<td width="600" style="font-size: 9pt" align=right valign=bottom><b><a href="http://www.szu.edu.cn"><font color="#FFFFFF">深大官网</font></a> <a href="/2014/en"><font color="#FFFFFF">English</font></a></b>&nbsp;</td>


<td height=1 colspan="2" bgcolor="#EEEEEE"></td>


<td colspan="2" class=tbcolor2>
<table border="0" width="100%" cellspacing="0" cellpadding="0">
<tr>
<td class="fontcolor1" style="font-size: 9pt">


  <img border="0" src="/images/who.png">&nbsp;<a href="#" onclick="window.location.href='/manage/caslogin.asp?rurl='+getstr(window.location.href);return false;" class=fontcolor1>您未登录|点击统一身份认证</a>

----------------上面是关于 td tr th 的信息提取--------------------

>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> html.encoding='GBK'
>>> html = html.text
>>> bs_obj = bs4.BeautifulSoup(html)
>>> bs_a_tag = bs_obj.findAll('a')
>>> print(bs_a_tag[0].text)
深大官网
>>> res_tr = r'<tr>(.*?)</tr>'
>>> m_tr =  re.findall(res_tr,html,re.S|re.M)
>>> for line in m_tr:
...   print(line)
...   res_th = r'<th>(.*?)</th>'
...   m_th = re.findall(res_th,line,re.S|re.M)
...   for mm in m_th:
...     print(unicode(mm,'utf-8'),)
...   res_td = r'<td>(.*?)</td>'
...   m_td = re.findall(res_td,line,re.S|re.M)
...   for nn in m_td:
...     print(unicode(nn,'utf-8'))
...

--------------下面是获取的超链接文本-----------------------------

>>> res = r'<a .*?>(.*?)</a>'
>>> mm = re.findall(res,html,re.S|re.M)
>>> for value in mm:

...   print(value)

【实例】python bs4 beautifulsoup + urllib.request 提取网址

-------------------

突然在想深度的问题,参考文章:http://blog.csdn.net/u012063507/article/details/72831751