在Python套接字缓存HTTP GET请求

问题描述：

我正在使用套接字制作代理服务器。当请求的文件不在我的当前目录（缓存）中时，我对源服务器（这是www）执行一个http get请求，并将其缓存以备后用。在Python套接字缓存HTTP GET请求

我的代码的问题是，每当我从www中获取资源时，我都会缓存它，但文件的内容总是“永久移动”。

所以这就是发生了什么：用户通过在浏览器中输入“localhost：8080/*.com”来请求“stackoverlflow.com”。浏览器将正确返回页面。当用户在浏览器中第二次输入“localhost：8080/*.com”时，浏览器将返回一个页面，说明*.com已永久移动。

下面是确实的HTTP GET请求，并缓存方法的代码：

@staticmethod 
    def find_on_www(conn, requested_file): 
     try: 
      # Create a socket on the proxy server 
      print 'Creating socket on proxy server' 
      c = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 

      host_name = requested_file.replace("www.","",1) 
      print 'Host Name: ', host_name 

      # Connect to the socket to port 80 
      c.connect((host_name, 80)) 
      print 'Socket connected to port 80 of the host' 

      # Create a temporary file on this socket and ask port 80 
      # for the file requested by the client 
      file_object = c.makefile('r', 0) 
      file_object.write("GET " + "http://" + requested_file + " HTTP/1.0\n\n") 

      # Read the response into buffer 
      buff = file_object.readlines() 

      # Create a new file in the cache for the requested file. 
      # Also send the response in the buffer to client socket 
      # and the corresponding file in the cache 
      temp_file = open("./" + requested_file, "wb") 
      for i in range(0, len(buff)): 
       temp_file.write(buff[i]) 
       conn.send(buff[i]) 

      conn.close()

，这里是我的代码的其余部分，如果有人有兴趣：

import socket  # Socket programming 
import signal  # To shut down server on ctrl+c 
import time   # Current time 
import os   # To get the last-modified 
import mimetypes # To guess the type of requested file 
import sys   # To exit the program 
from threading import Thread 


def generate_header_lines(code, modified, length, mimetype): 
     """ Generates the header lines for the response message """ 
     h = '' 

     if code == 200: 
      # Append status code 
      h = 'HTTP/1.1 200 OK\n' 
      # Append the date 

      # Append the name of the server 
      h += 'Server: Proxy-Server-Thomas\n' 
      # Append the date of the last modification to the file 
      h += 'Last-Modified: ' + modified + '\n' 

     elif code == 404: 
      # Append the status code 
      h = 'HTTP/1.1 404 Not Found\n' 
      # Append the date 
      h += 'Date: ' + time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()) + '\n' 
      # Append the name of the web server 
      h += 'Server: Web-Server-Thomas\n' 

     # Append the length of the content 
     h += 'Content-Length: ' + str(length) + '\n' 
     # Append the type of the content 
     h += 'Content-Type: ' + mimetype + '\n' 
     # Append the connection closed - let the client know we close the connection 
     h += 'Connection: close\n\n' 

     return h 


def get_mime_type(requested_file): 
    # Get the file's mimetype and encoding 
    try: 
     (mimetype, encoding) = mimetypes.guess_type(requested_file, True) 
     if not mimetype: 
      print "Mimetype found: text/html" 
      return 'text/html' 
     else: 
      print "Mimetype found: ", mimetype 
      return mimetype 

    except TypeError: 
     print "Mimetype found: text/html" 
     return 'text/html' 


class WebServer: 
    def __init__(self): 
     """ 
     Constructor 
     :return: 
     """ 
     self.host = ''  # Host for the server 
     self.port = 8000 # Port for the server 

     # Create socket 
     self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 

    def start_server(self): 
     """ Starts the server 
     :return: 
     """ 
     # Bind the socket to the host and port 
     self.socket.bind((self.host, self.port)) 

     print "Connection started on ", self.port 

     # Start the main loop of the server - start handling clients 
     self.main_loop() 

    @staticmethod 
    def shutdown(): 
     """ Shuts down the server """ 
     try: 
      s.socket.close() 
     except Exception as e: 
      print "Something went wrong closing the socket: ", e 

    def main_loop(self): 
     """Main loop of the server""" 
     while True: 
      # Start listening 
      self.socket.listen(1) 

      # Wait for a client to connect 
      client_socket, client_address = self.socket.accept() 

      # Wait for a request from the client 
      data = client_socket.recv(1024) 

      t = Thread(target=self.handle_request, args=(client_socket, data)) 
      t.start() 

      # # Handle the request from the client 
      # self.handle_request(client_socket, data) 

    def handle_request(self, conn, data): 
     """ Handles a request from the client """ 
     # Decode the data 
     string = bytes.decode(data) 

     # Split the request 
     requested_file = string.split(' ') 
     # Get the method that is requested 
     request_method = requested_file[0] 

     if request_method == 'GET': 
      # Get the part of the request that contains the name 
      requested_file = requested_file[1] 
      # Get the name of the file from the request 
      requested_file = requested_file[1:] 

      print "Searching for: ", requested_file 

      try: 
       # Open the file 
       file_handler = open(requested_file, 'rb') 
       # Get the content of the file 
       response_content = file_handler.read() 
       # Close the handler 
       file_handler.close() 

       # Get information about the file from the OS 
       file_info = os.stat(requested_file) 
       # Extract the last modified time from the information 
       time_modified = time.ctime(file_info[8]) 
       # Get the time modified in seconds 
       modified_seconds = os.path.getctime(requested_file) 

       print "Current time: ", time.time() 
       print "Modified: ", time_modified 

       if (float(time.time()) - float(modified_seconds)) > 120: # more than 2 minutes 
        print "Time outdated!" 
        #self.find_on_www(conn, requested_file) 

       # Get the file's mimetype and encoding 
       mimetype = get_mime_type(requested_file) 

       print "Mimetype = ", mimetype 

       # Create the correct header lines 
       response_headers = generate_header_lines(200, time_modified, len(response_content), mimetype) 

       # Create the response to the request 
       server_response = response_headers.encode() + response_content 

       # Send the response back to the client 
       conn.send(server_response) 

       # Close the connection 
       conn.close() 

      except IOError: # Couldn't find the file in the cache - Go find file on www 
       print "Error: " + requested_file + " not found in cache!" 
       self.find_on_www(conn, requested_file) 

    @staticmethod 
    def find_on_www(conn, requested_file): 
     try: 
      # Create a socket on the proxy server 
      print 'Creating socket on proxy server' 
      c = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 

      host_name = requested_file.replace("www.","",1) 
      print 'Host Name: ', host_name 

      # Connect to the socket to port 80 
      c.connect((host_name, 80)) 
      print 'Socket connected to port 80 of the host' 

      # Create a temporary file on this socket and ask port 80 
      # for the file requested by the client 
      file_object = c.makefile('r', 0) 
      file_object.write("GET " + "http://" + requested_file + " HTTP/1.0\n\n") 

      # Read the response into buffer 
      buff = file_object.readlines() 

      # Create a new file in the cache for the requested file. 
      # Also send the response in the buffer to client socket 
      # and the corresponding file in the cache 
      temp_file = open("./" + requested_file, "wb") 
      for i in range(0, len(buff)): 
       temp_file.write(buff[i]) 
       conn.send(buff[i]) 

      conn.close() 

     except Exception as e: 
      # Generate a body for the file - so we don't have an empty page 
      response_content = "<html><body><p>Error 404: File not found</p></body></html>" 

      # Generate the correct header lines 
      response_headers = generate_header_lines(404, '', len(response_content), 'text/html') 

      # Create the response to the request 
      server_response = response_headers.encode() + response_content 

      # Send the response back to the client 
      conn.send(server_response) 

      # Close the connection 
      conn.close() 


def shutdown_server(sig, dummy): 
    """ Shuts down the server """ 

    # Shutdown the server 
    s.shutdown() 

    # exit the program 
    sys.exit(1) 

# Shut down on ctrl+c 
signal.signal(signal.SIGINT, shutdown_server) 

# Create a web server 
s = WebServer() 
# Start the server 
s.start_server()

当我尝试使用Firefox 33时，我无法获得相同的结果。取而代之的是第二次尝试连接时，我询问是否希望下载页面，因为它认为它是Windows可执行文件。这是因为您的代码根据由扩展名确定的文件类型返回了mimetype，“.com”是Windows可执行文件。 – mpursuit 2014-12-07 13:29:01

正如你可以在我的get_mime_type函数中看到的那样，如果mimetypes库不能猜测MIME类型，我只会返回'text/html'。所以.com会返回'text/html'，它不应该认为它是可执行文件。但是，也许这不是正确的做法，你有什么建议吗？ – 2014-12-07 13:45:56

当你向实际的Web服务器发送请求时，我会读取请求头部返回的mimetype（在Content-Type中给出），将它存储在某个地方，然后在你从版本库中返回版本时重新创建头部时使用它缓存。 – mpursuit 2014-12-07 14:07:09

答

的你的代码的问题在于，如果你转到一个页面并返回一个301页面的状态代码，它会将它添加到页眉中。当您查看未存储在缓存中的页面时，将代理服务器直接发送的GET请求复制到客户端。这会通知客户端发出另一个GET请求，它会忽略您的代理服务器。

第二次尝试通过代理服务器请求页面时，它会从缓存中检索先前的请求。该文件包含前一个请求的头文件，该头文件正确包含重定向状态代码，然后您将自己的200 OK状态代码添加到返回的消息中。当客户端首先读取这个状态码时，它并没有意识到你希望它发出另一个请求来查找已被重定向的页面。因此它只显示告诉你页面已经移动的页面。

当代理服务器必须查看Internet上的实际页面时，您需要执行的操作是解析Web服务器返回的标头。然后根据这些服务器将正确的标题返回给客户端。

在Python套接字缓存HTTP GET请求

相关推荐