[Twisted-web] How can I change the HTTP request to avoid gzip

Tue Dec 16 07:20:56 EST 2008

On Tue, 16 Dec 2008 05:31:45 +0200, Radu Dragusin <radudragusin at gmail.com> wrote:
>I have an HTTP proxy made with twisted.web and want to change the request
>that the browser sends to the proxy so that the value of the
>'accept-encoding' key is changed from 'gzip,deflate' to ' '.
>
>I use the example from the Twisted Book:
>
>By adding the overridden process method in WordCountProxyRequest I can get
>the request header but have found no way to set a key, value pair.
>I want to make the server think that the browser does not support gzip because
>twisted seems not to support gzip, as the response from www.google.com and
>many (but not all) sites still appears encoded. www.dpreview.com seems not
>to gzip the response, and so the response is processed correctly.
>
>What can I do to either correctly decode gzip responses or modify the
>'accept-encoding' value to nothing so the server does not compress the
>response?
>
>Thank you!
>*Example 4-8. wordcountproxy.py*
>
>import sgmllib, re
>from twisted.web import proxy, http
>import sys
>from twisted.python import log
>log.startLogging(sys.stdout)
>
>WEB_PORT = 8000
>PROXY_PORT = 8001
>
>class WordParser(sgmllib.SGMLParser):
>    def __init__(self):
>        sgmllib.SGMLParser.__init__(self)
>        self.chardata = []
>        self.inBody = False
>
>    def start_body(self, attrs):
>        self.inBody = True
>
>    def end_body(self):
>        self.inBody = False
>
>    def handle_data(self, data):
>        if self.inBody:
>            self.chardata.append(data)
>
>    def getWords(self):
>        # extract words
>        wordFinder = re.compile(r'\w*')
>        words = wordFinder.findall("".join(self.chardata))
>        words = filter(lambda word: word.strip( ), words)
>        print "WORDS ARE", words
>        return words
>
>class WordCounter(object):
>    ignoredWords = "the a of in from to this that and or but is was be
>can could i you they we at".split( )
>
>    def __init__(self):
>        self.words = {}
>
>    def addWords(self, words):
>        for word in words:
>            word = word.lower( )
>            if not word in self.ignoredWords:
>                currentCount = self.words.get(word, 0)
>                self.words[word] = currentCount + 1
>
>class WordCountProxyClient(proxy.ProxyClient):
>    def handleHeader(self, key, value):
>        proxy.ProxyClient.handleHeader(self, key, value)

How about skipping it here?

>        if key.lower( ) == "content-type":
>            if value.split(';')[0] == 'text/html':
>                self.parser = WordParser( )
>
>    def handleResponsePart(self, data):
>        proxy.ProxyClient.handleResponsePart(self, data)
>        if hasattr(self, 'parser'): self.parser.feed(data)
>
>
>    def handleResponseEnd(self):
>        proxy.ProxyClient.handleResponseEnd(self)
>        if hasattr(self, 'parser'):
>            self.parser.close( )
>            self.father.wordCounter.addWords(self.parser.getWords( ))
>            del(self.parser)
>
>class WordCountProxyClientFactory(proxy.ProxyClientFactory):
>    def buildProtocol(self, addr):
>        client = proxy.ProxyClientFactory.buildProtocol(self, addr)
>        # upgrade proxy.proxyClient object to WordCountProxyClient
>        client.__class__ = WordCountProxyClient
>        return client
>
>class WordCountProxyRequest(proxy.ProxyRequest):
>    protocols = {'http': WordCountProxyClientFactory}
>
>    def __init__(self, wordCounter, *args):
>        self.wordCounter = wordCounter
>        proxy.ProxyRequest.__init__(self, *args)
>
>*    def process(self):
>        proxy.ProxyRequest.process(self)
>        print "received_headers", proxy.ProxyRequest.getAllHeaders(self)*
>
>class WordCountProxy(proxy.Proxy):
>    def __init__(self, wordCounter):
>        self.wordCounter = wordCounter
>        proxy.Proxy.__init__(self)
>
>    def requestFactory(self, *args):
>        return WordCountProxyRequest(self.wordCounter, *args)
>
>class WordCountProxyFactory(http.HTTPFactory):
>    def __init__(self, wordCounter):
>        self.wordCounter = wordCounter
>        http.HTTPFactory.__init__(self)
>
>    def buildProtocol(self, addr):
>        protocol = WordCountProxy(self.wordCounter)
>        return protocol
>
># classes for web reporting interface
>class WebReportRequest(http.Request):
>    def __init__(self, wordCounter, *args):
>        self.wordCounter = wordCounter
>        http.Request.__init__(self, *args)
>
>    def process(self):
>        self.setHeader("Content-Type", "text/html")
>        words = self.wordCounter.words.items( )
>        words.sort(lambda (w1, c1), (w2, c2): cmp(c2, c1))
>        for word, count in words:
>            self.write("<li>%s %s</li>" % (word, count))
>        self.finish( )
>
>class WebReportChannel(http.HTTPChannel):
>    def __init__(self, wordCounter):
>        self.wordCounter = wordCounter
>        http.HTTPChannel.__init__(self)
>
>    def requestFactory(self, *args):
>        return WebReportRequest(self.wordCounter, *args)
>
>class WebReportFactory(http.HTTPFactory):
>    def __init__(self, wordCounter):
>        self.wordCounter = wordCounter
>        http.HTTPFactory.__init__(self)
>
>    def buildProtocol(self, addr):
>        return WebReportChannel(self.wordCounter)
>
>if __name__ == "__main__":
>    from twisted.internet import reactor
>    counter = WordCounter( )
>    prox = WordCountProxyFactory(counter)
>    reactor.listenTCP(PROXY_PORT, prox)
>    reactor.listenTCP(WEB_PORT, WebReportFactory(counter))
>    reactor.run( )
>
>

Jean-Paul