Python 爬虫 提取整个页面的 链接 通用

提取整个页面的 链接 通用

Python
from html.parser import HTMLParser from urllib import parse import requests class LinkFinder(HTMLParser): def __init__(self, base_url, page_url): super().__init__() self.base_url = base_url self.page_url = page_url self.links = set() # When we call HTMLParser feed() this function is called when it # encounters an opening tag <a> def handle_starttag(self, tag, attrs): if tag == 'a': for (attribute, value) in attrs: if attribute == 'href': url = parse.urljoin(self.base_url, value) if self.base_url in url: self.links.add(url) # exclud def page_links(self): return self.links def error(self, message): pass if __name__ == '__main__': finder = LinkFinder("https://www.csai.cn", 'https://www.csai.cn/baoxian/') r = requests.get("https://www.csai.cn/baoxian/") finder.feed(r.text) urls = list(set(finder.page_links())) print(urls)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from html.parser import HTMLParser
from urllib import parse
import requests
 
 
class LinkFinder(HTMLParser):
    """Collect the in-site hyperlinks found in one HTML page.

    Feed the page's HTML to ``feed()``; every ``<a href="...">`` is resolved
    to an absolute URL and kept only if it lies under ``base_url``.

    Args:
        base_url: Root URL of the site to stay within, e.g.
            ``"https://www.csai.cn"``. Links outside it are discarded.
        page_url: URL of the page being parsed. Relative hrefs are resolved
            against it; if empty, ``base_url`` is used instead.
    """

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        """Record the target of every ``<a>`` start tag seen by ``feed()``."""
        if tag != 'a':
            return
        for attribute, value in attrs:
            # A valueless attribute (`<a href>`) arrives as None; an empty
            # href just points back at the current page. Neither is a link.
            if attribute != 'href' or not value:
                continue
            try:
                # Relative links are relative to the page they appear on,
                # not to the site root.
                url = parse.urljoin(self.page_url or self.base_url, value)
                in_scope = self._in_scope(url)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip it
                # rather than abort parsing the rest of the page.
                continue
            if in_scope:
                self.links.add(url)

    def _in_scope(self, url):
        """Return True if *url* has base_url's scheme and host and sits under its path."""
        target = parse.urlsplit(url)
        base = parse.urlsplit(self.base_url)
        # Compare parsed components; a substring test would also accept
        # foreign URLs that merely mention the base URL somewhere.
        if (target.scheme, target.netloc.lower()) != (base.scheme, base.netloc.lower()):
            return False
        base_path = base.path.rstrip('/')
        return target.path == base_path or (target.path or '/').startswith(base_path + '/')

    def page_links(self):
        """Return the set of absolute in-site URLs collected so far."""
        return self.links

    def error(self, message):
        """Ignore parser errors.

        Older Python versions call this hook on malformed markup; overriding
        it keeps the link extraction best-effort.
        """
        pass
 
 
if __name__ == '__main__':
    # Demo: list every in-site link found on one page of the site.
    site_root = "https://www.csai.cn"
    page = 'https://www.csai.cn/baoxian/'

    finder = LinkFinder(site_root, page)
    # Without a timeout requests waits indefinitely on a stalled server.
    r = requests.get(page, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    finder.feed(r.text)
    # page_links() is already a set; sort only to make the output stable.
    urls = sorted(finder.page_links())
    print(urls)
 

效果如图

Python 爬虫 提取整个页面的 链接 通用

Python 爬虫 提取整个页面的 链接 通用