在Python套接字缓存HTTP GET请求
问题描述:
我正在使用套接字制作代理服务器。当请求的文件不在我的当前目录(缓存)中时,我对源服务器(这是www)执行一个http get请求,并将其缓存以备后用。在Python套接字缓存HTTP GET请求
我的代码的问题是,每当我从www中获取资源时,我都会缓存它,但文件的内容总是“永久移动”。
所以这就是发生了什么:用户通过在浏览器中输入“localhost:8080/*.com”来请求“stackoverlflow.com”。浏览器将正确返回页面。当用户在浏览器中第二次输入“localhost:8080/*.com”时,浏览器将返回一个页面,说明*.com已永久移动。
下面是确实的HTTP GET请求,并缓存方法的代码:
@staticmethod
def find_on_www(conn, requested_file):
try:
# Create a socket on the proxy server
print 'Creating socket on proxy server'
c = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host_name = requested_file.replace("www.","",1)
print 'Host Name: ', host_name
# Connect to the socket to port 80
c.connect((host_name, 80))
print 'Socket connected to port 80 of the host'
# Create a temporary file on this socket and ask port 80
# for the file requested by the client
file_object = c.makefile('r', 0)
file_object.write("GET " + "http://" + requested_file + " HTTP/1.0\n\n")
# Read the response into buffer
buff = file_object.readlines()
# Create a new file in the cache for the requested file.
# Also send the response in the buffer to client socket
# and the corresponding file in the cache
temp_file = open("./" + requested_file, "wb")
for i in range(0, len(buff)):
temp_file.write(buff[i])
conn.send(buff[i])
conn.close()
,这里是我的代码的其余部分,如果有人有兴趣:
import socket # Socket programming
import signal # To shut down server on ctrl+c
import time # Current time
import os # To get the last-modified
import mimetypes # To guess the type of requested file
import sys # To exit the program
from threading import Thread
def generate_header_lines(code, modified, length, mimetype):
""" Generates the header lines for the response message """
h = ''
if code == 200:
# Append status code
h = 'HTTP/1.1 200 OK\n'
# Append the date
# Append the name of the server
h += 'Server: Proxy-Server-Thomas\n'
# Append the date of the last modification to the file
h += 'Last-Modified: ' + modified + '\n'
elif code == 404:
# Append the status code
h = 'HTTP/1.1 404 Not Found\n'
# Append the date
h += 'Date: ' + time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()) + '\n'
# Append the name of the web server
h += 'Server: Web-Server-Thomas\n'
# Append the length of the content
h += 'Content-Length: ' + str(length) + '\n'
# Append the type of the content
h += 'Content-Type: ' + mimetype + '\n'
# Append the connection closed - let the client know we close the connection
h += 'Connection: close\n\n'
return h
def get_mime_type(requested_file):
# Get the file's mimetype and encoding
try:
(mimetype, encoding) = mimetypes.guess_type(requested_file, True)
if not mimetype:
print "Mimetype found: text/html"
return 'text/html'
else:
print "Mimetype found: ", mimetype
return mimetype
except TypeError:
print "Mimetype found: text/html"
return 'text/html'
class WebServer:
def __init__(self):
"""
Constructor
:return:
"""
self.host = '' # Host for the server
self.port = 8000 # Port for the server
# Create socket
self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
def start_server(self):
""" Starts the server
:return:
"""
# Bind the socket to the host and port
self.socket.bind((self.host, self.port))
print "Connection started on ", self.port
# Start the main loop of the server - start handling clients
self.main_loop()
@staticmethod
def shutdown():
""" Shuts down the server """
try:
s.socket.close()
except Exception as e:
print "Something went wrong closing the socket: ", e
def main_loop(self):
"""Main loop of the server"""
while True:
# Start listening
self.socket.listen(1)
# Wait for a client to connect
client_socket, client_address = self.socket.accept()
# Wait for a request from the client
data = client_socket.recv(1024)
t = Thread(target=self.handle_request, args=(client_socket, data))
t.start()
# # Handle the request from the client
# self.handle_request(client_socket, data)
def handle_request(self, conn, data):
""" Handles a request from the client """
# Decode the data
string = bytes.decode(data)
# Split the request
requested_file = string.split(' ')
# Get the method that is requested
request_method = requested_file[0]
if request_method == 'GET':
# Get the part of the request that contains the name
requested_file = requested_file[1]
# Get the name of the file from the request
requested_file = requested_file[1:]
print "Searching for: ", requested_file
try:
# Open the file
file_handler = open(requested_file, 'rb')
# Get the content of the file
response_content = file_handler.read()
# Close the handler
file_handler.close()
# Get information about the file from the OS
file_info = os.stat(requested_file)
# Extract the last modified time from the information
time_modified = time.ctime(file_info[8])
# Get the time modified in seconds
modified_seconds = os.path.getctime(requested_file)
print "Current time: ", time.time()
print "Modified: ", time_modified
if (float(time.time()) - float(modified_seconds)) > 120: # more than 2 minutes
print "Time outdated!"
#self.find_on_www(conn, requested_file)
# Get the file's mimetype and encoding
mimetype = get_mime_type(requested_file)
print "Mimetype = ", mimetype
# Create the correct header lines
response_headers = generate_header_lines(200, time_modified, len(response_content), mimetype)
# Create the response to the request
server_response = response_headers.encode() + response_content
# Send the response back to the client
conn.send(server_response)
# Close the connection
conn.close()
except IOError: # Couldn't find the file in the cache - Go find file on www
print "Error: " + requested_file + " not found in cache!"
self.find_on_www(conn, requested_file)
@staticmethod
def find_on_www(conn, requested_file):
try:
# Create a socket on the proxy server
print 'Creating socket on proxy server'
c = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host_name = requested_file.replace("www.","",1)
print 'Host Name: ', host_name
# Connect to the socket to port 80
c.connect((host_name, 80))
print 'Socket connected to port 80 of the host'
# Create a temporary file on this socket and ask port 80
# for the file requested by the client
file_object = c.makefile('r', 0)
file_object.write("GET " + "http://" + requested_file + " HTTP/1.0\n\n")
# Read the response into buffer
buff = file_object.readlines()
# Create a new file in the cache for the requested file.
# Also send the response in the buffer to client socket
# and the corresponding file in the cache
temp_file = open("./" + requested_file, "wb")
for i in range(0, len(buff)):
temp_file.write(buff[i])
conn.send(buff[i])
conn.close()
except Exception as e:
# Generate a body for the file - so we don't have an empty page
response_content = "<html><body><p>Error 404: File not found</p></body></html>"
# Generate the correct header lines
response_headers = generate_header_lines(404, '', len(response_content), 'text/html')
# Create the response to the request
server_response = response_headers.encode() + response_content
# Send the response back to the client
conn.send(server_response)
# Close the connection
conn.close()
def shutdown_server(sig, dummy):
""" Shuts down the server """
# Shutdown the server
s.shutdown()
# exit the program
sys.exit(1)
# Shut down on ctrl+c
signal.signal(signal.SIGINT, shutdown_server)
# Create a web server
s = WebServer()
# Start the server
s.start_server()
答
的你的代码的问题在于,如果你转到一个页面并返回一个301页面的状态代码,它会将它添加到页眉中。当您查看未存储在缓存中的页面时,将代理服务器直接发送的GET请求复制到客户端。这会通知客户端发出另一个GET请求,它会忽略您的代理服务器。
第二次尝试通过代理服务器请求页面时,它会从缓存中检索先前的请求。该文件包含前一个请求的头文件,该头文件正确包含重定向状态代码,然后您将自己的200 OK状态代码添加到返回的消息中。当客户端首先读取这个状态码时,它并没有意识到你希望它发出另一个请求来查找已被重定向的页面。因此它只显示告诉你页面已经移动的页面。
当代理服务器必须查看Internet上的实际页面时,您需要执行的操作是解析Web服务器返回的标头。然后根据这些服务器将正确的标题返回给客户端。
当我尝试使用Firefox 33时,我无法获得相同的结果。取而代之的是第二次尝试连接时,我询问是否希望下载页面,因为它认为它是Windows可执行文件。 这是因为您的代码根据由扩展名确定的文件类型返回了mimetype,“.com”是Windows可执行文件。 – mpursuit 2014-12-07 13:29:01
正如你可以在我的get_mime_type函数中看到的那样,如果mimetypes库不能猜测MIME类型,我只会返回'text/html'。所以.com会返回'text/html',它不应该认为它是可执行文件。但是,也许这不是正确的做法,你有什么建议吗? – 2014-12-07 13:45:56
当你向实际的Web服务器发送请求时,我会读取请求头部返回的mimetype(在Content-Type中给出),将它存储在某个地方,然后在你从版本库中返回版本时重新创建头部时使用它缓存。 – mpursuit 2014-12-07 14:07:09