Changeset 3348

Show
Ignore:
Timestamp:
01/11/10 11:42:00 (10 years ago)
Author:
cemeyer
Message:

httpretrieve: More extensive changes, reorg in httpretrieve.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • seattle/trunk/seattlelib/httpretrieve.repy

    r3337 r3348  
    1717 
    1818 
    19  
    2019include urlparse.repy 
    2120include sockettimeout.repy 
    2221include urllib.repy 
    23  
    2422 
    2523 
     
    10098  """ 
    10199 
     100  # TODO: Make sure the timeout actually works correctly everywhere it 
     101  # should. I'm 99% sure it's broken somewhere. 
     102 
    102103  starttime = getruntime() 
    103104 
    104105  # Check if the URL is valid and get host, path, port and query 
    105   (host, port, path) = _httpretrieve_parse_given_url(url) 
     106  parsedurl = urlparse_urlsplit(url) 
     107  host = parsedurl['hostname'] 
     108  path = parsedurl['path'] 
     109  port = parsedurl.get('port', 80) 
     110 
     111  if parsedurl['scheme'] != 'http': 
     112    raise ValueError("URL doesn't seem to be for the HTTP protocol.") 
     113  if host is None: 
     114    raise ValueError("Missing hostname.") 
     115  if parsedurl['query'] is not None and urlparse['query'] != "": 
     116    raise ValueError("URL cannot include a query string.") 
    106117 
    107118  # Open connection to the web server 
     
    115126 
    116127  # build an HTTP request using the given port, host, path and query 
    117   httpheader = _httpretrieve_buildhttprequest(httpheaders, port, host, \ 
    118       path, querydata, postdata) 
     128  method = "GET" 
     129  if postdata is not None: 
     130    method = "POST" 
     131    if type(postdata) is dict: 
     132      postdata = urllib_quote_parameters(postdata) 
     133    if type(postdata) is not str: 
     134      raise TypeError("postdata should be a dict of form data or string") 
     135  else: 
     136    postdata = "" 
     137 
     138  if path == "": 
     139    path = "/" 
     140  if type(querydata) is dict: 
     141    querydata = urllib_quote_parameters(querydata) 
     142  if type(querydata) is str and querydata != "": 
     143    encquerydata = "?" + querydata 
     144  else: 
     145    encquerydata = "" 
     146 
     147  httpheader = method + ' ' + path + encquerydata + ' HTTP/1.0\r\n' 
     148  if "Host" not in httpheaders: 
     149    httpheader += "Host: " + host + ':' + port + "\r\n" 
     150  if httpheaders is not None: 
     151    if type(httpheaders) is not dict: 
     152      raise TypeError("Expected HTTP headers as a dictionary.") 
     153    else: 
     154      for key, val in httpheaders.items(): 
     155        httpheader += key + ": " + val + '\r\n' 
     156 
     157  if method == "POST": 
     158    httpheader += 'Content-Length: ' + str(len(postdata)) + '\r\n' 
     159  httpheader += '\r\n' 
     160  if method == "POST": 
     161    httpheader += postdata 
    119162 
    120163  # send HTTP request to the web server 
     
    122165 
    123166  # receive the header lines from the web server 
    124   httpheaderlines = _httpretrieve_receive_httpheader(sock, \ 
    125       timeout, getruntime() - starttime) 
     167  if timeout is None: 
     168    sock.settimeout(0) 
     169  elif getruntime() - starttime >= timeout: 
     170    raise SocketTimeoutError("Timed out") 
     171  else: 
     172    sock.settimeout(timeout - (getruntime() - starttime)) 
     173 
     174  headers_str = "" 
     175  while True: 
     176    headers_str += sock.recv(1) 
     177    if headers_str.endswith("\r\n\r\n"): 
     178      break 
     179 
     180  httpheaderlines = headers_str.split("\r\n") 
    126181 
    127182  # get the status code and status message from the HTTP response 
    128   (http_status_number, http_status_msg) = \ 
    129       _httpretrieve_get_httpstatuscode(httpheaderlines) 
    130  
    131   if http_status_number == '200': 
    132     contentlength = _httpretrieve_get_contentlength(httpheaderlines) 
    133     return _httpretrieve_filelikeobject(sock, contentlength) 
    134  
    135   elif http_status_number == '301' or http_status_number == '302': 
     183  statusline, httpheaderlines = httpheaderlines[0], httpheaderlines[1:] 
     184  headersplit = statusline.split(' ', 2) 
     185  if len(headersplit) < 3: 
     186    raise HttpBrokenServerError("Server returned garbage for HTTP response.") 
     187  if headersplit[0] != 'HTTP': 
     188    raise HttpBrokenServerError("Server returned garbage for HTTP response.") 
     189  statusmsg = headersplit[2] 
     190  try: 
     191    statusnum = int(headersplit[1]) 
     192  except ValueError, e: 
     193    raise HttpBrokenServerError("Server returned garbage for HTTP response.") 
     194 
     195  responseheaders = _httpretrieve_parse_responseheaders(httpheaderlines) 
     196 
     197  if statusnum == 301 or statusnum == 302: 
    136198    # redirect to the new location via recursion 
    137199    sock.close() 
    138     redirect_location = _httpretrieve_httpredirect(httpheaderlines) 
    139     return httpretrieve_open(redirect_location) 
     200    try: 
     201      redirect_location = responseheaders["Location"][0] 
     202    except (KeyError, IndexError), ke: 
     203      raise HttpBrokenServerError("Server returned garbage for HTTP" + \ 
     204          " response. Redirect response missing Location header.") 
     205    else: 
     206      return httpretrieve_open(redirect_location) 
    140207 
    141208  else: 
    142     # Raise an exception detailing the status code and content of the 
    143     # page to the user. 
    144     contentlength = _httpretrieve_get_contentlength(httpheaderlines) 
    145     http_errorcontent = \ 
    146         _httpretrieve_receive_httperror_content(sock) 
     209    return _httpretrieve_filelikeobject(sock, responseheaders, \ 
     210        (headersplit[0], statusnum, statusmsg)) 
    147211 
    148212 
     
    184248  # Read from the file-like HTTP object into our file. 
    185249  while True: 
    186     httpcontent = http_obj.read(1024) 
     250    httpcontent = http_obj.read(4096) 
    187251    if httpcontent == '': 
    188252      # we're done reading 
     
    229293  # requests and retrieving responses. 
    230294 
    231   def __init__(self, sock, contentlength): 
    232     self.sock = sock 
    233     if contentlength == None: 
    234       self.contentlengthisknown = False 
    235     else: 
    236       self.contentlengthisknown = True 
    237       self.contentlength = contentlength 
    238     self.timeout = timeout 
    239     self.fileobjclosed = False 
    240     self.totalcontentisreceived = False 
    241     self.totalread = 0 
     295  def __init__(self, sock, headers, httpstatus): 
     296    self._sock = sock 
     297    self._fileobjclosed = False 
     298    self._totalcontentisreceived = False 
     299    self._totalread = 0 
     300    self.headers = headers 
     301    self.httpstatus = httpstatus 
    242302 
    243303 
     
    265325    """ 
    266326 
    267     if self.fileobjclosed == True: 
     327    if self._fileobjclosed == True: 
    268328      raise ValueError("I/O operation on closed file") 
    269329 
    270     if self.totalcontentisreceived: 
     330    if self._totalcontentisreceived: 
    271331      return '' 
    272332 
    273     if limit == None: 
    274       readhaslimit = False 
    275       left_to_read = 1024 
    276     else: 
     333    if limit is not None: 
    277334      # Sanity check type/value of limit 
    278335      if type(limit) is not int: 
    279         raise TypeError("Expected an integer for limit") 
     336        raise TypeError("Expected an integer or None for limit") 
    280337      elif limit < 0: 
    281338        raise ValueError("Expected a non-negative integer for limit") 
    282339 
    283       readhaslimit = True 
    284       left_to_read = limit 
     340      lefttoread = limit 
     341    else: 
     342      lefttoread = None 
    285343 
    286344    if timeout is None: 
    287       self.sock.settimeout(0) 
     345      self._sock.settimeout(0) 
    288346    else: 
    289       self.sock.settimeout(timeout) 
     347      self._sock.settimeout(timeout) 
    290348 
    291349    # Try to read up to limit, or until there is nothing left. 
    292350    httpcontent = '' 
    293351    while True: 
    294       content = self.sock.recv(left_to_read) 
    295  
     352      content = self._sock.recv(lefttoread or 4096) 
    296353      httpcontent += content 
    297       if readhaslimit: 
    298         self.totalread += len(content) 
    299         if len(content) == left_to_read: 
     354      self._totalread += len(content) 
     355      if limit is not None: 
     356        if len(content) == lefttoread: 
    300357          break 
    301358        else: 
    302           left_to_read -= len(content) 
     359          lefttoread -= len(content) 
     360      if content == "": 
     361        self._totalcontentisreceived = True 
     362        break 
    303363 
    304364    return httpcontent 
     
    323383      Nothing 
    324384    """ 
    325     self.fileobjclosed = True 
    326     self.sock.close() 
    327  
    328  
    329  
    330  
    331 def _httpretrieve_parse_given_url(url): 
    332   # Checks that the URL is in the right format and returns a tuple of host, 
    333   # port, path and query. 
    334   urlparse = urlparse_urlsplit(url) 
    335   if urlparse['scheme'] != 'http': 
    336     raise ValueError("URL doesn't seem to be for the HTTP protocol.") 
    337   if urlparse['hostname'] == None: 
    338     raise ValueError("Missing hostname.") 
    339   if urlparse['query'] is not None and urlparse['query'] != "": 
    340     raise ValueError("URL cannot include a query string.") 
    341  
    342   host = urlparse['hostname'] 
    343   path = urlparse['path'] 
    344   port = urlparse.get('port', 80) 
    345  
    346   return (host, port, path) 
    347  
    348  
    349  
    350  
    351 def _httpretrieve_buildhttprequest(httpheaders, port, host, path, \ 
    352     querydata, postdata): 
    353   # Builds the HTTP request. 
    354  
    355   if postdata != None: 
    356     # There is a posted data, use HTTP POST. 
    357  
    358     if type(postdata) is dict: 
    359       postdata = urllib_quote_parameters(postdata) 
    360  
    361     if type(postdata) is not str: 
    362       raise TypeError("postdata should be a dict of form data or string") 
    363  
    364     # Build the minimal HTTP request header -- includes only the request 
    365     # and the Host field. 
    366     httpheader = _httpretrieve_httprequestmain_header('POST', querydata, \ 
    367         path, host, port) 
    368  
    369     # Build the rest of the request. 
    370     httpheader += _httpretrieve_parse_clienthttpheader(httpheaders) 
    371     httpheader += 'Content-Length: ' + str(len(postdata)) + '\r\n' 
    372     httpheader += '\r\n' 
    373     httpheader += postdata 
    374  
    375   else: 
    376     # There is no posted data, use HTTP GET. 
    377     httpheader = _httpretrieve_httprequestmain_header('GET', querydata, \ 
    378         path, host, port) 
    379     httpheader += _httpretrieve_parse_clienthttpheader(httpheaders) 
    380     httpheader += '\r\n' 
    381  
    382   return httpheader 
    383  
    384  
    385  
    386  
    387 def _httpretrieve_httprequestmain_header(http_command, querydata, path, \ 
    388     host, port): 
    389   # Builds a minimal HTTP request, returning it as a string. 
    390  
    391   if type(querydata) is dict: 
    392     querydata = urllib_quote_parameters(querydata) 
    393  
    394   if type(querydata) is str and querydata != '': 
    395     encoded_query = '?' + url_query 
    396   else: 
    397     encoded_query = '' 
    398  
    399   # A non-empty path is a required part of an HTTP request. 
    400   addpath = '/' 
    401   if path != '': 
    402     addpath = path 
    403  
    404   main_httpheader = http_command + ' ' + addpath + encoded_query + \ 
    405       ' HTTP/1.0\r\n' 
    406  
    407   # We don't need to include the port in the Host header if it is 80. 
    408   addport = '' 
    409   if port != 80: 
    410     addport = ':' + str(port) 
    411  
    412   main_httpheader += 'Host: ' + host + addport + '\r\n' 
    413   return main_httpheader 
    414  
    415  
    416  
    417  
    418 def _httpretrieve_parse_clienthttpheader(httpheaders): 
    419   # Converts a dictionary of HTTP request headers into a string. 
    420  
    421   if httpheaders is None: 
    422     return '' 
    423  
    424   elif type(httpheaders) is not dict: 
    425     raise TypeError("Expected HTTP headers as a dictionary.") 
    426  
    427   else: 
    428     clienthttpheader = '' 
    429     for key, val in httpheaders.items(): 
    430       clienthttpheader += key + ': ' + val + '\r\n' 
    431     return clienthttpheader 
    432  
    433  
    434  
    435  
    436 def _httpretrieve_receive_httpheader(sock, timeout, currentruntime): 
    437   # Receives the HTTP headers only. Returns them as a list of strings. 
    438  
    439   if timeout is None: 
    440     sock.settimeout(0) 
    441   elif timeout - currentruntime <= 0: 
    442     raise SocketTimeoutError("Timed out") 
    443   else: 
    444     sock.settimeout(timeout - currentruntime) 
    445  
    446   httpheader_received = 0 
    447   httpheader = '' 
    448   while True: 
    449     # CRLFCRLF separates the HTTP headers from the body of the response. 
    450     if httpheader.endswith('\r\n\r\n'): 
    451       return httpheader.split('\r\n') 
    452  
    453     content = sock.recv(1) 
    454     httpheader_received += 1 
    455     httpheader += content 
    456  
    457  
    458  
    459  
    460 def _httpretrieve_get_httpstatuscode(httpHeaderLines): 
    461   # Checks if the status code does not indicate an error. 
    462  
    463   # The first line of an HTTP response is composed of: 
    464   # HTTP<version> http_status_number http_status_msg 
    465   httpstatusheader = httpHeaderLines[0] 
    466   headersplit = httpstatusheader.split(' ', 2) 
    467  
    468   if len(headersplit) != 3: 
    469     raise HttpBrokenServerError("Server returned garbage for HTTP response.") 
    470   if not httpstatusheader.startswith('HTTP'): 
    471     raise HttpBrokenServerError("Server returned garbage for HTTP response.") 
    472  
    473   http_status_msg = headersplit[2] 
    474  
     385    self._fileobjclosed = True 
     386    self._sock.close() 
     387 
     388 
     389 
     390 
     391def _httpretrieve_parse_responseheaders(headerlines): 
     392  # Parse rfc822-style headers (this could be abstracted out to an rfc822 
     393  # library that would be quite useful for internet protocols). Returns 
     394  # a dictionary mapping headers to arrays of values. E.g.: 
     395  # 
     396  # Foo: a 
     397  # Bar: 
     398  #   b 
     399  # Bar: c 
     400  # 
     401  # Becomes: {"Foo": ["a"], "Bar": ["b", "c"]} 
     402 
     403  i = 0 
     404  lastheader = None 
     405  lastheader_str = "" 
     406  res = {} 
    475407  try: 
    476     return (int(headersplit[1]), http_status_msg) 
    477   except ValueError, e: 
    478     raise HttpBrokenServerError("Server returned garbage for HTTP response.") 
    479  
    480  
    481  
    482  
    483 def _httpretrieve_receive_httperror_content(sock): 
    484   # Receive the error message (this is called when the server returns an 
    485   # 'error' response). 
    486  
    487   httperror_content = '' 
    488   while True: 
    489     content = sock.recv(1024) 
    490     httperror_content += content 
    491  
    492   return httperror_content 
    493  
    494  
    495  
    496  
    497 def _httpretrieve_httpredirect(httpheaderlines): 
    498   # Determine redirect location from response headers. 
    499  
    500   for headerline in httpheaderlines: 
    501     if headerline.startswith('Location: '): 
    502       return headerline[len('Location: '):] 
    503  
    504   raise HttpBrokenServerError("HTTP server indicated a redirect and did " + \ 
    505       "not send a Location header.") 
    506  
    507  
    508  
    509  
    510 def _httpretrieve_get_contentlength(httpheaderlines): 
    511   # Determines the value of the Content-Length header. 
    512  
    513   for headerline in httpheaderlines: 
    514     if headerline.startswith('Content-Length: '): 
    515       return int(headerline[len('Content-Length: '):]) 
    516  
    517   return None 
     408    while True: 
     409      # A leading space or tab marks a continuation of the previous header 
     410      if headerlines[i][0] in (" ", "\t") and lastheader is not None: 
     411        lastheader_str += headerlines[i] 
     412      else: 
     413        if lastheader is not None: 
     414          if lastheader not in res: 
     415            res[lastheader] = [] 
     416          res[lastheader].append(lastheader_str.strip()) 
     417        lastheader, lastheader_str = headerlines[i].split(":") 
     418      i += 1 
     419      if i >= len(headerlines): 
     420        break 
     421    return res 
     422  except IndexError, idx: 
     423    raise HttpBrokenServerError("Server returned garbage for HTTP" + \ 
     424        " response. Bad header.")