Python Wiki路径搜索


一时心血来潮,我写了一些代码,用来搜索任意两篇 Wiki 文章之间最短的链接路径。事实证明它是有效的,但只要目标的深度超过一两层链接,就需要很长时间才能找到。我最终会记录并利用链接路径和页面内容,但希望先让搜索本身以最优的方式运行。有没有更快的方法来实现这一点,或者有什么好办法可以大幅削减搜索的工作量?

import urllib2 
from bs4 import BeautifulSoup 
# URL of the page the crawl starts from; passed to wiki_crawl at module level.
# Left empty here, so it has to be filled in before the script can fetch anything.
Start = '' 
# URL of the page the crawl is trying to reach; its <title> is what each
# candidate page is compared against. Also left empty here.
End = '' 

def soup_request(target):
    """Fetch the page at *target* and return it parsed by BeautifulSoup.

    target -- URL string to download.

    Returns the BeautifulSoup document built from the response body.
    Any error raised by urllib2 while opening the URL propagates to the caller.
    """
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0")
    # Open the prepared Request, not the bare URL: passing `target` here
    # would silently drop the User-Agent header set above.
    page = urllib2.urlopen(request)
    try:
        # BeautifulSoup reads the whole file-like object in its constructor,
        # so the connection can be closed as soon as parsing returns.
        return BeautifulSoup(page)
    finally:
        page.close()

def get_links(Start):
    """Return the URLs of all ``/wiki`` links found on the page at *Start*.

    Start -- URL of the page to scan.

    Every ``<a>`` whose ``href`` begins with ``/wiki`` is collected, in
    document order and without de-duplication.  The hrefs are site-relative,
    so each one is resolved against *Start* to give a URL that soup_request
    can open directly.
    """
    # Python 2 stdlib; imported locally so this function is self-contained.
    from urlparse import urljoin

    soup = soup_request(Start)
    Wiki_links = []
    for anchor in soup.findAll('a'):
        href = anchor.get('href')
        # Anchors without an href give None; test the value directly rather
        # than via str(), which can fail on non-ASCII hrefs in Python 2.
        if href and href.startswith('/wiki'):
            Wiki_links.append(urljoin(Start, href))
    print("Got new links from %s" % Start)
    return Wiki_links

def check_links(Links, End):
    """Report whether any page in *Links* has the same title as the page at *End*.

    Links -- iterable of page URLs to test; ``None`` entries are skipped.
    End   -- URL of the goal page.

    Each link is downloaded in turn and its ``<title>`` element compared with
    the goal page's.  Returns True as soon as one matches, otherwise False
    after every link has been tried.  Progress is printed for each download.
    """
    goaltitle = soup_request(End).html.title
    # The total never changes, so compute it once instead of per iteration.
    length = len(Links)
    count = 0
    for q in Links:
        if q is None:
            continue
        count += 1
        soup = soup_request(q)
        print("Checked %d links out of %d" % (count, length))
        try:
            title = soup.html.head.title
        except AttributeError:
            # A document without <html> or <head> makes the attribute chain
            # hit None; report it and carry on with the next link.
            print('doh')
            continue
        if title == goaltitle:
            print("Found it!")
            # Stop immediately: every further link would cost a download.
            return True
    return False

def wiki_crawl(Start, End, depth):
    """Search outward from *Start* for *End*, one round of links at a time.

    Start -- URL of the page to begin from.
    End   -- URL of the page being looked for.
    depth -- maximum number of rounds (link hops) to try.

    Each round gathers the links of every page found in the previous round
    and checks them against the goal.  The outcome is only printed; nothing
    is returned.
    """
    # Defined up front so that depth <= 0 reaches the final check safely.
    Found = False
    Old_Links = [Start]
    remaining = depth
    while remaining > 0:
        New_Links = []
        for page in Old_Links:
            New_Links.extend(get_links(page))
        Found = check_links(New_Links, End)
        if Found:
            print("All done.")
            # Leave now; another round would overwrite Found and keep crawling.
            break
        Old_Links = New_Links
        remaining -= 1
        print("_______________________________________________________________ROUND DONE")
    if not Found:
        print("Did not find the page, you must go deeper!")

# Runs the search as soon as the module is executed or imported: look for End
# within at most 2 rounds of links starting from Start.
wiki_crawl(Start, End, 2) 


def take_out_parenthesis(st):
    """Return *st* with every ``(`` and ``)`` character removed.

    Only the parenthesis characters themselves are dropped; the text between
    them is kept.  Works for any number of parentheses in any order.
    """
    return st.replace('(', '').replace(')', '')

def take_out_tags(string):
    """Return *string* with every ``<...>`` span removed.

    The single character immediately before each ``<`` is removed together
    with the tag; take_from_web_page pads every tag with a space so that this
    does not eat real text.  A tag at the very start of the text has nothing
    before it, so only the tag itself is dropped.  A ``<`` with no ``>`` after
    it is left untouched, along with everything that follows.
    """
    kept = []
    pos = 0
    while True:
        open_at = string.find('<', pos)
        close_at = string.find('>', open_at + 1) if open_at != -1 else -1
        if close_at == -1:
            break
        kept.extend(string[pos:open_at])
        # Drop the character that precedes the tag, if there is one.
        if kept:
            kept.pop()
        pos = close_at + 1
    kept.extend(string[pos:])
    return ''.join(kept)

def take_out_brackets(string):
    """Return *string* with every ``[...]`` span removed.

    The single character immediately before each ``[`` is removed together
    with the bracketed span.  A span at the very start of the text has nothing
    before it, so only the span itself is dropped.  A ``[`` with no ``]``
    after it is left untouched, along with everything that follows.
    """
    # NOTE(review): unlike tags, nothing in this file pads brackets with a
    # space, so the preceding-character removal can eat real text (for
    # example the "." in ".[1]").  Kept as-is; confirm whether it is intended.
    kept = []
    pos = 0
    while True:
        open_at = string.find('[', pos)
        close_at = string.find(']', open_at + 1) if open_at != -1 else -1
        if close_at == -1:
            break
        kept.extend(string[pos:open_at])
        # Drop the character that precedes the bracket, if there is one.
        if kept:
            kept.pop()
        pos = close_at + 1
    kept.extend(string[pos:])
    return ''.join(kept)

def take_from_web_page(text, url_template="https://en.wikipedia.org/wiki/%s"):
    """Download the article named *text* and return its opening passage as plain text.

    text         -- article title; spaces are replaced by underscores to form
                    the last part of the URL.
    url_template -- ``%``-format string with one ``%s`` slot for the title.

    The passage runs from just after the first ``<p><b>`` in the page to the
    first ``</a>.`` that follows it.  Tags, bracketed spans and parenthesis
    characters are then stripped with the take_out_* helpers.  Returns an
    empty string when either marker is missing from the page.
    """
    # NOTE(review): the template literal in the original was empty (""), which
    # makes the "%" below raise TypeError for every call.  English Wikipedia
    # is assumed as the default here -- confirm the intended site.
    url = text.replace(" ", "_")
    search = url_template % url
    response = urllib2.urlopen(search)
    try:
        page = response.read()
    finally:
        response.close()

    marker_at = page.find('<p><b>')
    if marker_at == -1:
        return ''
    start = marker_at + 6
    tail_at = page.find('</a>.', start)
    if tail_at == -1:
        return ''
    new_page = page[start:tail_at + 5]

    # take_out_tags removes the character in front of every "<", so give each
    # tag a sacrificial space unless one is already there.
    padded = []
    prev = ''
    for ch in new_page:
        if ch == '<' and prev != ' ':
            padded.append(' ')
        padded.append(ch)
        prev = ch
    return take_out_parenthesis(take_out_brackets(take_out_tags(''.join(padded))))