#!/usr/bin/env python # # Copyright 2009 Facebook # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain # a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. """Blocking and non-blocking HTTP client implementations using pycurl.""" from __future__ import with_statement import cStringIO import calendar import collections import email.utils import errno import httplib import logging import pycurl import sys import time import weakref from tornado import escape from tornado import httputil from tornado import ioloop from tornado import stack_context class HTTPClient(object): """A blocking HTTP client backed with pycurl. Typical usage looks like this: http_client = httpclient.HTTPClient() try: response = http_client.fetch("http://www.google.com/") print response.body except httpclient.HTTPError, e: print "Error:", e fetch() can take a string URL or an HTTPRequest instance, which offers more options, like executing POST/PUT/DELETE requests. """ def __init__(self, max_simultaneous_connections=None): self._curl = _curl_create(max_simultaneous_connections) def __del__(self): self._curl.close() def fetch(self, request, **kwargs): """Executes an HTTPRequest, returning an HTTPResponse. If an error occurs during the fetch, we raise an HTTPError. """ if not isinstance(request, HTTPRequest): request = HTTPRequest(url=request, **kwargs) buffer = cStringIO.StringIO() headers = httputil.HTTPHeaders() try: _curl_setup_request(self._curl, request, buffer, headers) self._curl.perform() code = self._curl.getinfo(pycurl.HTTP_CODE) effective_url = self._curl.getinfo(pycurl.EFFECTIVE_URL) buffer.seek(0) response = HTTPResponse( request=request, code=code, headers=headers, buffer=buffer, effective_url=effective_url) if code < 200 or code >= 300: raise HTTPError(code, response=response) return response except pycurl.error, e: buffer.close() raise CurlError(*e) class AsyncHTTPClient(object): """An non-blocking HTTP client backed with pycurl. Example usage: import ioloop def handle_request(response): if response.error: print "Error:", response.error else: print response.body ioloop.IOLoop.instance().stop() http_client = httpclient.AsyncHTTPClient() http_client.fetch("http://www.google.com/", handle_request) ioloop.IOLoop.instance().start() fetch() can take a string URL or an HTTPRequest instance, which offers more options, like executing POST/PUT/DELETE requests. The keyword argument max_clients to the AsyncHTTPClient constructor determines the maximum number of simultaneous fetch() operations that can execute in parallel on each IOLoop. """ _ASYNC_CLIENTS = weakref.WeakKeyDictionary() def __new__(cls, io_loop=None, max_clients=10, max_simultaneous_connections=None): # There is one client per IOLoop since they share curl instances io_loop = io_loop or ioloop.IOLoop.instance() if io_loop in cls._ASYNC_CLIENTS: return cls._ASYNC_CLIENTS[io_loop] else: instance = super(AsyncHTTPClient, cls).__new__(cls) instance.io_loop = io_loop instance._multi = pycurl.CurlMulti() instance._multi.setopt(pycurl.M_TIMERFUNCTION, instance._set_timeout) instance._multi.setopt(pycurl.M_SOCKETFUNCTION, instance._handle_socket) instance._curls = [_curl_create(max_simultaneous_connections) for i in xrange(max_clients)] instance._free_list = instance._curls[:] instance._requests = collections.deque() instance._fds = {} instance._timeout = None cls._ASYNC_CLIENTS[io_loop] = instance try: instance._socket_action = instance._multi.socket_action except AttributeError: # socket_action is found in pycurl since 7.18.2 (it's been # in libcurl longer than that but wasn't accessible to # python). logging.warning("socket_action method missing from pycurl; " "falling back to socket_all. Upgrading " "libcurl and pycurl will improve performance") instance._socket_action = \ lambda fd, action: instance._multi.socket_all() # libcurl has bugs that sometimes cause it to not report all # relevant file descriptors and timeouts to TIMERFUNCTION/ # SOCKETFUNCTION. Mitigate the effects of such bugs by # forcing a periodic scan of all active requests. instance._force_timeout_callback = ioloop.PeriodicCallback( instance._handle_force_timeout, 1000, io_loop=io_loop) instance._force_timeout_callback.start() return instance def close(self): """Destroys this http client, freeing any file descriptors used. Not needed in normal use, but may be helpful in unittests that create and destroy http clients. No other methods may be called on the AsyncHTTPClient after close(). """ del AsyncHTTPClient._ASYNC_CLIENTS[self.io_loop] self._force_timeout_callback.stop() for curl in self._curls: curl.close() self._multi.close() self._closed = True def fetch(self, request, callback, **kwargs): """Executes an HTTPRequest, calling callback with an HTTPResponse. If an error occurs during the fetch, the HTTPResponse given to the callback has a non-None error attribute that contains the exception encountered during the request. You can call response.reraise() to throw the exception (if any) in the callback. """ if not isinstance(request, HTTPRequest): request = HTTPRequest(url=request, **kwargs) self._requests.append((request, stack_context.wrap(callback))) self._process_queue() self._set_timeout(0) def _handle_socket(self, event, fd, multi, data): """Called by libcurl when it wants to change the file descriptors it cares about. """ event_map = { pycurl.POLL_NONE: ioloop.IOLoop.NONE, pycurl.POLL_IN: ioloop.IOLoop.READ, pycurl.POLL_OUT: ioloop.IOLoop.WRITE, pycurl.POLL_INOUT: ioloop.IOLoop.READ | ioloop.IOLoop.WRITE } if event == pycurl.POLL_REMOVE: self.io_loop.remove_handler(fd) del self._fds[fd] else: ioloop_event = event_map[event] if fd not in self._fds: self._fds[fd] = ioloop_event self.io_loop.add_handler(fd, self._handle_events, ioloop_event) else: self._fds[fd] = ioloop_event self.io_loop.update_handler(fd, ioloop_event) def _set_timeout(self, msecs): """Called by libcurl to schedule a timeout.""" if self._timeout is not None: self.io_loop.remove_timeout(self._timeout) self._timeout = self.io_loop.add_timeout( time.time() + msecs/1000.0, self._handle_timeout) def _handle_events(self, fd, events): """Called by IOLoop when there is activity on one of our file descriptors. """ action = 0 if events & ioloop.IOLoop.READ: action |= pycurl.CSELECT_IN if events & ioloop.IOLoop.WRITE: action |= pycurl.CSELECT_OUT while True: try: ret, num_handles = self._socket_action(fd, action) except pycurl.error, e: ret = e[0] if ret != pycurl.E_CALL_MULTI_PERFORM: break self._finish_pending_requests() def _handle_timeout(self): """Called by IOLoop when the requested timeout has passed.""" with stack_context.NullContext(): self._timeout = None while True: try: ret, num_handles = self._socket_action( pycurl.SOCKET_TIMEOUT, 0) except pycurl.error, e: ret = e[0] if ret != pycurl.E_CALL_MULTI_PERFORM: break self._finish_pending_requests() # In theory, we shouldn't have to do this because curl will # call _set_timeout whenever the timeout changes. However, # sometimes after _handle_timeout we will need to reschedule # immediately even though nothing has changed from curl's # perspective. This is because when socket_action is # called with SOCKET_TIMEOUT, libcurl decides internally which # timeouts need to be processed by using a monotonic clock # (where available) while tornado uses python's time.time() # to decide when timeouts have occurred. When those clocks # disagree on elapsed time (as they will whenever there is an # NTP adjustment), tornado might call _handle_timeout before # libcurl is ready. After each timeout, resync the scheduled # timeout with libcurl's current state. new_timeout = self._multi.timeout() if new_timeout != -1: self._set_timeout(new_timeout) def _handle_force_timeout(self): """Called by IOLoop periodically to ask libcurl to process any events it may have forgotten about. """ with stack_context.NullContext(): while True: try: ret, num_handles = self._multi.socket_all() except pycurl.error, e: ret = e[0] if ret != pycurl.E_CALL_MULTI_PERFORM: break self._finish_pending_requests() def _finish_pending_requests(self): """Process any requests that were completed by the last call to multi.socket_action. """ while True: num_q, ok_list, err_list = self._multi.info_read() for curl in ok_list: self._finish(curl) for curl, errnum, errmsg in err_list: self._finish(curl, errnum, errmsg) if num_q == 0: break self._process_queue() def _process_queue(self): with stack_context.NullContext(): while True: started = 0 while self._free_list and self._requests: started += 1 curl = self._free_list.pop() (request, callback) = self._requests.popleft() curl.info = { "headers": httputil.HTTPHeaders(), "buffer": cStringIO.StringIO(), "request": request, "callback": callback, "curl_start_time": time.time(), } # Disable IPv6 to mitigate the effects of this bug # on curl versions <= 7.21.0 # http://sourceforge.net/tracker/?func=detail&aid=3017819&group_id=976&atid=100976 if pycurl.version_info()[2] <= 0x71500: # 7.21.0 curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4) _curl_setup_request(curl, request, curl.info["buffer"], curl.info["headers"]) self._multi.add_handle(curl) if not started: break def _finish(self, curl, curl_error=None, curl_message=None): info = curl.info curl.info = None self._multi.remove_handle(curl) self._free_list.append(curl) buffer = info["buffer"] if curl_error: error = CurlError(curl_error, curl_message) code = error.code effective_url = None buffer.close() buffer = None else: error = None code = curl.getinfo(pycurl.HTTP_CODE) effective_url = curl.getinfo(pycurl.EFFECTIVE_URL) buffer.seek(0) # the various curl timings are documented at # http://curl.haxx.se/libcurl/c/curl_easy_getinfo.html time_info = dict( queue=info["curl_start_time"] - info["request"].start_time, namelookup=curl.getinfo(pycurl.NAMELOOKUP_TIME), connect=curl.getinfo(pycurl.CONNECT_TIME), pretransfer=curl.getinfo(pycurl.PRETRANSFER_TIME), starttransfer=curl.getinfo(pycurl.STARTTRANSFER_TIME), total=curl.getinfo(pycurl.TOTAL_TIME), redirect=curl.getinfo(pycurl.REDIRECT_TIME), ) try: info["callback"](HTTPResponse( request=info["request"], code=code, headers=info["headers"], buffer=buffer, effective_url=effective_url, error=error, request_time=time.time() - info["curl_start_time"], time_info=time_info)) except (KeyboardInterrupt, SystemExit): raise except: logging.error("Exception in callback %r", info["callback"], exc_info=True) # For backwards compatibility: Tornado 1.0 included a new implementation of # AsyncHTTPClient that has since replaced the original. Define an alias # so anything that used AsyncHTTPClient2 still works AsyncHTTPClient2 = AsyncHTTPClient class HTTPRequest(object): def __init__(self, url, method="GET", headers=None, body=None, auth_username=None, auth_password=None, connect_timeout=20.0, request_timeout=20.0, if_modified_since=None, follow_redirects=True, max_redirects=5, user_agent=None, use_gzip=True, network_interface=None, streaming_callback=None, header_callback=None, prepare_curl_callback=None, proxy_host=None, proxy_port=None, proxy_username=None, proxy_password='', allow_nonstandard_methods=False): if headers is None: headers = httputil.HTTPHeaders() if if_modified_since: timestamp = calendar.timegm(if_modified_since.utctimetuple()) headers["If-Modified-Since"] = email.utils.formatdate( timestamp, localtime=False, usegmt=True) if "Pragma" not in headers: headers["Pragma"] = "" # Proxy support: proxy_host and proxy_port must be set to connect via # proxy. The username and password credentials are optional. self.proxy_host = proxy_host self.proxy_port = proxy_port self.proxy_username = proxy_username self.proxy_password = proxy_password # libcurl's magic "Expect: 100-continue" behavior causes delays # with servers that don't support it (which include, among others, # Google's OpenID endpoint). Additionally, this behavior has # a bug in conjunction with the curl_multi_socket_action API # (https://sourceforge.net/tracker/?func=detail&atid=100976&aid=3039744&group_id=976), # which increases the delays. It's more trouble than it's worth, # so just turn off the feature (yes, setting Expect: to an empty # value is the official way to disable this) if "Expect" not in headers: headers["Expect"] = "" self.url = _utf8(url) self.method = method self.headers = headers self.body = body self.auth_username = _utf8(auth_username) self.auth_password = _utf8(auth_password) self.connect_timeout = connect_timeout self.request_timeout = request_timeout self.follow_redirects = follow_redirects self.max_redirects = max_redirects self.user_agent = user_agent self.use_gzip = use_gzip self.network_interface = network_interface self.streaming_callback = streaming_callback self.header_callback = header_callback self.prepare_curl_callback = prepare_curl_callback self.allow_nonstandard_methods = allow_nonstandard_methods self.start_time = time.time() class HTTPResponse(object): """HTTP Response object. Attributes: * request: HTTPRequest object * code: numeric HTTP status code, e.g. 200 or 404 * headers: httputil.HTTPHeaders object * buffer: cStringIO object for response body * body: respose body as string (created on demand from self.buffer) * error: Exception object, if any * request_time: seconds from request start to finish * time_info: dictionary of diagnostic timing information from the request. Available data are subject to change, but currently uses timings available from http://curl.haxx.se/libcurl/c/curl_easy_getinfo.html, plus 'queue', which is the delay (if any) introduced by waiting for a slot under AsyncHTTPClient's max_clients setting. """ def __init__(self, request, code, headers={}, buffer=None, effective_url=None, error=None, request_time=None, time_info={}): self.request = request self.code = code self.headers = headers self.buffer = buffer self._body = None if effective_url is None: self.effective_url = request.url else: self.effective_url = effective_url if error is None: if self.code < 200 or self.code >= 300: self.error = HTTPError(self.code, response=self) else: self.error = None else: self.error = error self.request_time = request_time self.time_info = time_info def _get_body(self): if self.buffer is None: return None elif self._body is None: self._body = self.buffer.getvalue() return self._body body = property(_get_body) def rethrow(self): if self.error: raise self.error def __repr__(self): args = ",".join("%s=%r" % i for i in self.__dict__.iteritems()) return "%s(%s)" % (self.__class__.__name__, args) def __del__(self): if self.buffer is not None: self.buffer.close() class HTTPError(Exception): """Exception thrown for an unsuccessful HTTP request. Attributes: code - HTTP error integer error code, e.g. 404. Error code 599 is used when no HTTP response was received, e.g. for a timeout. response - HTTPResponse object, if any. Note that if follow_redirects is False, redirects become HTTPErrors, and you can look at error.response.headers['Location'] to see the destination of the redirect. """ def __init__(self, code, message=None, response=None): self.code = code message = message or httplib.responses.get(code, "Unknown") self.response = response Exception.__init__(self, "HTTP %d: %s" % (self.code, message)) class CurlError(HTTPError): def __init__(self, errno, message): HTTPError.__init__(self, 599, message) self.errno = errno def _curl_create(max_simultaneous_connections=None): curl = pycurl.Curl() if logging.getLogger().isEnabledFor(logging.DEBUG): curl.setopt(pycurl.VERBOSE, 1) curl.setopt(pycurl.DEBUGFUNCTION, _curl_debug) curl.setopt(pycurl.MAXCONNECTS, max_simultaneous_connections or 5) return curl def _curl_setup_request(curl, request, buffer, headers): curl.setopt(pycurl.URL, request.url) # Request headers may be either a regular dict or HTTPHeaders object if isinstance(request.headers, httputil.HTTPHeaders): curl.setopt(pycurl.HTTPHEADER, [_utf8("%s: %s" % i) for i in request.headers.get_all()]) else: curl.setopt(pycurl.HTTPHEADER, [_utf8("%s: %s" % i) for i in request.headers.iteritems()]) if request.header_callback: curl.setopt(pycurl.HEADERFUNCTION, request.header_callback) else: curl.setopt(pycurl.HEADERFUNCTION, lambda line: _curl_header_callback(headers, line)) if request.streaming_callback: curl.setopt(pycurl.WRITEFUNCTION, request.streaming_callback) else: curl.setopt(pycurl.WRITEFUNCTION, buffer.write) curl.setopt(pycurl.FOLLOWLOCATION, request.follow_redirects) curl.setopt(pycurl.MAXREDIRS, request.max_redirects) curl.setopt(pycurl.CONNECTTIMEOUT, int(request.connect_timeout)) curl.setopt(pycurl.TIMEOUT, int(request.request_timeout)) if request.user_agent: curl.setopt(pycurl.USERAGENT, _utf8(request.user_agent)) else: curl.setopt(pycurl.USERAGENT, "Mozilla/5.0 (compatible; pycurl)") if request.network_interface: curl.setopt(pycurl.INTERFACE, request.network_interface) if request.use_gzip: curl.setopt(pycurl.ENCODING, "gzip,deflate") else: curl.setopt(pycurl.ENCODING, "none") if request.proxy_host and request.proxy_port: curl.setopt(pycurl.PROXY, request.proxy_host) curl.setopt(pycurl.PROXYPORT, request.proxy_port) if request.proxy_username: credentials = '%s:%s' % (request.proxy_username, request.proxy_password) curl.setopt(pycurl.PROXYUSERPWD, credentials) # Set the request method through curl's retarded interface which makes # up names for almost every single method curl_options = { "GET": pycurl.HTTPGET, "POST": pycurl.POST, "PUT": pycurl.UPLOAD, "HEAD": pycurl.NOBODY, } custom_methods = set(["DELETE"]) for o in curl_options.values(): curl.setopt(o, False) if request.method in curl_options: curl.unsetopt(pycurl.CUSTOMREQUEST) curl.setopt(curl_options[request.method], True) elif request.allow_nonstandard_methods or request.method in custom_methods: curl.setopt(pycurl.CUSTOMREQUEST, request.method) else: raise KeyError('unknown method ' + request.method) # Handle curl's cryptic options for every individual HTTP method if request.method in ("POST", "PUT"): request_buffer = cStringIO.StringIO(escape.utf8(request.body)) curl.setopt(pycurl.READFUNCTION, request_buffer.read) if request.method == "POST": def ioctl(cmd): if cmd == curl.IOCMD_RESTARTREAD: request_buffer.seek(0) curl.setopt(pycurl.IOCTLFUNCTION, ioctl) curl.setopt(pycurl.POSTFIELDSIZE, len(request.body)) else: curl.setopt(pycurl.INFILESIZE, len(request.body)) if request.auth_username and request.auth_password: userpwd = "%s:%s" % (request.auth_username, request.auth_password) curl.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC) curl.setopt(pycurl.USERPWD, userpwd) logging.info("%s %s (username: %r)", request.method, request.url, request.auth_username) else: curl.unsetopt(pycurl.USERPWD) logging.info("%s %s", request.method, request.url) if request.prepare_curl_callback is not None: request.prepare_curl_callback(curl) def _curl_header_callback(headers, header_line): # header_line as returned by curl includes the end-of-line characters. header_line = header_line.strip() if header_line.startswith("HTTP/"): headers.clear() return if not header_line: return headers.parse_line(header_line) def _curl_debug(debug_type, debug_msg): debug_types = ('I', '<', '>', '<', '>') if debug_type == 0: logging.debug('%s', debug_msg.strip()) elif debug_type in (1, 2): for line in debug_msg.splitlines(): logging.debug('%s %s', debug_types[debug_type], line) elif debug_type == 4: logging.debug('%s %r', debug_types[debug_type], debug_msg) def _utf8(value): if value is None: return value if isinstance(value, unicode): return value.encode("utf-8") assert isinstance(value, str) return value def main(): from tornado.options import define, options, parse_command_line define("print_headers", type=bool, default=False) define("print_body", type=bool, default=True) define("follow_redirects", type=bool, default=True) args = parse_command_line() client = HTTPClient() for arg in args: try: response = client.fetch(arg, follow_redirects=options.follow_redirects) except HTTPError, e: if e.response is not None: response = e.response else: raise if options.print_headers: print response.headers if options.print_body: print response.body if __name__ == "__main__": main()