[Twisted-web] How can I change the HTTP request to avoid gzip
Jean-Paul Calderone
exarkun at divmod.com
Tue Dec 16 07:20:56 EST 2008
On Tue, 16 Dec 2008 05:31:45 +0200, Radu Dragusin <radudragusin at gmail.com> wrote:
>I have an HTTP proxy made with twisted.web and want to change the request
>that the browser sends to the proxy so that the value of the
>'accept-encoding' key is changed from 'gzip,deflate' to ' '.
>
>I use the example from the Twisted book:
>
>By adding the overridden process method in WordCountProxyRequest I can get
>the request headers, but I have found no way to set a key/value pair.
>I want to make the server think that the browser does not support gzip, because
>Twisted does not seem to support gzip: the responses from www.google.com and
>many (but not all) sites still appear encoded. www.dpreview.com seems not
>to gzip its responses, and so its response is processed correctly.
>
>What can I do to either correctly decode gzip responses or modify the
>'accept-encoding' value to nothing so the server does not compress the
>response?
>
>Thank you!
>*Example 4-8. wordcountproxy.py*
>
>import sgmllib, re
>from twisted.web import proxy, http
>import sys
>from twisted.python import log
>log.startLogging(sys.stdout)
>
>WEB_PORT = 8000
>PROXY_PORT = 8001
>
>class WordParser(sgmllib.SGMLParser):
> def __init__(self):
> sgmllib.SGMLParser.__init__(self)
> self.chardata = []
> self.inBody = False
>
> def start_body(self, attrs):
> self.inBody = True
>
> def end_body(self):
> self.inBody = False
>
> def handle_data(self, data):
> if self.inBody:
> self.chardata.append(data)
>
> def getWords(self):
> # extract words
> wordFinder = re.compile(r'\w*')
> words = wordFinder.findall("".join(self.chardata))
> words = filter(lambda word: word.strip( ), words)
> print "WORDS ARE", words
> return words
>
>class WordCounter(object):
> ignoredWords = "the a of in from to this that and or but is was be
>can could i you they we at".split( )
>
> def __init__(self):
> self.words = {}
>
> def addWords(self, words):
> for word in words:
> word = word.lower( )
> if not word in self.ignoredWords:
> currentCount = self.words.get(word, 0)
> self.words[word] = currentCount + 1
>
>class WordCountProxyClient(proxy.ProxyClient):
> def handleHeader(self, key, value):
> proxy.ProxyClient.handleHeader(self, key, value)
How about skipping it here?
> if key.lower( ) == "content-type":
> if value.split(';')[0] == 'text/html':
> self.parser = WordParser( )
>
> def handleResponsePart(self, data):
> proxy.ProxyClient.handleResponsePart(self, data)
> if hasattr(self, 'parser'): self.parser.feed(data)
>
>
> def handleResponseEnd(self):
> proxy.ProxyClient.handleResponseEnd(self)
> if hasattr(self, 'parser'):
> self.parser.close( )
> self.father.wordCounter.addWords(self.parser.getWords( ))
> del(self.parser)
>
>class WordCountProxyClientFactory(proxy.ProxyClientFactory):
> def buildProtocol(self, addr):
> client = proxy.ProxyClientFactory.buildProtocol(self, addr)
> # upgrade proxy.proxyClient object to WordCountProxyClient
> client.__class__ = WordCountProxyClient
> return client
>
>class WordCountProxyRequest(proxy.ProxyRequest):
> protocols = {'http': WordCountProxyClientFactory}
>
> def __init__(self, wordCounter, *args):
> self.wordCounter = wordCounter
> proxy.ProxyRequest.__init__(self, *args)
>
>* def process(self):
> proxy.ProxyRequest.process(self)
> print "received_headers", proxy.ProxyRequest.getAllHeaders(self)*
>
>class WordCountProxy(proxy.Proxy):
> def __init__(self, wordCounter):
> self.wordCounter = wordCounter
> proxy.Proxy.__init__(self)
>
> def requestFactory(self, *args):
> return WordCountProxyRequest(self.wordCounter, *args)
>
>class WordCountProxyFactory(http.HTTPFactory):
> def __init__(self, wordCounter):
> self.wordCounter = wordCounter
> http.HTTPFactory.__init__(self)
>
> def buildProtocol(self, addr):
> protocol = WordCountProxy(self.wordCounter)
> return protocol
>
># classes for web reporting interface
>class WebReportRequest(http.Request):
> def __init__(self, wordCounter, *args):
> self.wordCounter = wordCounter
> http.Request.__init__(self, *args)
>
> def process(self):
> self.setHeader("Content-Type", "text/html")
> words = self.wordCounter.words.items( )
> words.sort(lambda (w1, c1), (w2, c2): cmp(c2, c1))
> for word, count in words:
> self.write("<li>%s %s</li>" % (word, count))
> self.finish( )
>
>class WebReportChannel(http.HTTPChannel):
> def __init__(self, wordCounter):
> self.wordCounter = wordCounter
> http.HTTPChannel.__init__(self)
>
> def requestFactory(self, *args):
> return WebReportRequest(self.wordCounter, *args)
>
>class WebReportFactory(http.HTTPFactory):
> def __init__(self, wordCounter):
> self.wordCounter = wordCounter
> http.HTTPFactory.__init__(self)
>
> def buildProtocol(self, addr):
> return WebReportChannel(self.wordCounter)
>
>if __name__ == "__main__":
> from twisted.internet import reactor
> counter = WordCounter( )
> prox = WordCountProxyFactory(counter)
> reactor.listenTCP(PROXY_PORT, prox)
> reactor.listenTCP(WEB_PORT, WebReportFactory(counter))
> reactor.run( )
>
>
Jean-Paul
More information about the Twisted-web
mailing list