httpio_bbc
from __future__ import absolute_import

import re
import requests

from io import BufferedIOBase

from six import PY3
from sys import version_info

__all__ = ["open", "HTTPIOError", "HTTPIOFile"]


# The expected exception from unimplemented IOBase operations
IOBaseError = OSError if PY3 else IOError


def open(url, block_size=-1, **kwargs):
    """
    Open a URL as a file-like object

    :param url: The URL of the file to open
    :param block_size: The cache block size, or `-1` to disable caching.
    :param kwargs: Additional arguments to pass to `requests.Request()`
    :return: An `httpio.HTTPIOFile` object supporting most of the usual
        file-like object methods.
    """
    f = HTTPIOFile(url, block_size, **kwargs)
    f.open()
    return f


class HTTPIOError(IOBaseError):
    pass


class SyncHTTPIOFile(BufferedIOBase):
    def __init__(self, url, block_size=-1, no_head_request=False, **kwargs):
        super(SyncHTTPIOFile, self).__init__()
        self.url = url
        self.block_size = block_size
        self.no_head_request = no_head_request

        self._kwargs = kwargs
        self._cursor = 0
        self._cache = {}
        self._session = None

        self.length = None

        self._closing = False

    def __repr__(self):
        status = "closed" if self.closed else "open"
        return "<%s %s %r at %s>" % (status, type(self).__name__, self.url, hex(id(self)))

    def __enter__(self):
        self.open()
        return super(SyncHTTPIOFile, self).__enter__()

    def _check_ranges_set_length(self, response):
        try:
            self.length = int(response.headers['Content-Length'])
        except KeyError:
            raise HTTPIOError("Server does not report content length")
        if response.headers.get('Accept-Ranges', '').lower() != 'bytes':
            raise HTTPIOError("Server does not accept 'Range' headers")

    def _check_file_headers_set_length(self, getter):
        pass

    def open(self):
        self._assert_not_closed()
        if not self._closing and self._session is None:
            self._session = requests.Session()

            if not self.no_head_request:
                response = self._session.head(self.url, **self._kwargs)

                # In some cases, notably including AWS S3 presigned URLs, it's only
                # possible to GET the URL and HEAD isn't supported. In these cases we
                # skip raising an exception and fall through to the `no_head_request`
                # behaviour instead
                if response.status_code != 405 and response.status_code != 403:
                    response.raise_for_status()
                    self._check_ranges_set_length(response)
                    return

            # GET the URL with stream=True to avoid downloading the full response:
            # exiting the context manager will close the connection
            with self._session.get(self.url, stream=True, **self._kwargs) as response:
                response.raise_for_status()
                self._check_ranges_set_length(response)

    def close(self):
        self._closing = True
        self._cache.clear()
        if self._session is not None:
            self._session.close()
        super(SyncHTTPIOFile, self).close()

    def flush(self):
        self._assert_not_closed()
        self.open()
        self._cache.clear()

    def peek(self, size=-1):
        loc = self.tell()
        data = self.read1(size)
        self.seek(loc)

        return data

    def read(self, size=-1):
        return self._read_impl(size)

    def read1(self, size=-1):
        return self._read_impl(size, 1)

    def readable(self):
        return True

    def readinto(self, b):
        return self._readinto_impl(b)

    def readinto1(self, b):
        return self._readinto_impl(b, 1)

    def seek(self, offset, whence=0):
        self._assert_not_closed()
        if whence == 0:
            self._cursor = offset
        elif whence == 1:
            self._cursor += offset
        elif whence == 2:
            self._cursor = self.length + offset
        else:
            raise HTTPIOError("Invalid argument: whence=%r" % whence)
        if not (0 <= self._cursor <= self.length):
            raise HTTPIOError("Invalid argument: cursor=%r" % self._cursor)
        return self._cursor

    def seekable(self):
        return True

    def tell(self):
        self._assert_not_closed()
        self.open()
        return self._cursor

    def write(self, *args, **kwargs):
        raise HTTPIOError("Writing not supported on http resource")

    def _read_impl(self, size=-1, max_raw_reads=-1):
        self._assert_not_closed()
        self.open()

        if size < 1 or self._cursor + size > self.length:
            size = self.length - self._cursor

        if size == 0:
            return b""

        if self.block_size <= 0:
            data = self._read_raw(self._cursor, self._cursor + size)

        else:
            data = b''.join(self._read_cached(size,
                                              max_raw_reads=max_raw_reads))

        self._cursor += len(data)
        return data

    def _readinto_impl(self, b, max_raw_reads=-1):
        self._assert_not_closed()
        self.open()

        size = len(b)

        if self._cursor + size > self.length:
            size = self.length - self._cursor

        if size == 0:
            return 0

        if self.block_size <= 0:
            b[:size] = self._read_raw(self._cursor, self._cursor + size)
            return size

        else:
            n = 0
            for sector in self._read_cached(size,
                                            max_raw_reads=max_raw_reads):
                b[n:n+len(sector)] = sector
                n += len(sector)

            return n

    def _read_cached(self, size, max_raw_reads=-1):
        sector0, offset0 = divmod(self._cursor, self.block_size)
        sector1, offset1 = divmod(self._cursor + size - 1, self.block_size)
        offset1 += 1
        sector1 += 1

        # Fetch any sectors missing from the cache
        status = "".join(str(int(idx in self._cache))
                         for idx in range(sector0, sector1))
        raw_reads = 0
        for match in re.finditer("0+", status):
            if max_raw_reads >= 0 and raw_reads >= max_raw_reads:
                break

            data = self._read_raw(
                self.block_size * (sector0 + match.start()),
                self.block_size * (sector0 + match.end()))

            raw_reads += 1

            for idx in range(match.end() - match.start()):
                self._cache[sector0 + idx + match.start()] = data[
                    self.block_size * idx:
                    self.block_size * (idx + 1)]

        data = []
        for idx in range(sector0, sector1):
            if idx not in self._cache:
                break

            start = offset0 if idx == sector0 else None
            end = offset1 if idx == (sector1 - 1) else None
            data.append(self._cache[idx][start:end])

        return data

    def _read_raw(self, start, end):
        headers = {"Range": "bytes=%d-%d" % (start, end - 1)}
        headers.update(self._kwargs.get("headers", {}))
        kwargs = dict(self._kwargs)
        kwargs['headers'] = headers
        response = self._session.get(
            self.url,
            **kwargs)
        response.raise_for_status()
        return response.content

    def _assert_not_closed(self):
        if self.closed:
            raise HTTPIOError("I/O operation on closed resource")


if version_info[0] > 3 or (version_info[0] == 3 and version_info[1] >= 6):
    from .asyncio import AsyncHTTPIOFileContextManagerMixin

    class HTTPIOFile(SyncHTTPIOFile, AsyncHTTPIOFileContextManagerMixin):
        pass
else:
    class HTTPIOFile(SyncHTTPIOFile):
        pass
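A minimal usage sketch, assuming a reachable URL (the address below is hypothetical) whose server reports Content-Length and accepts Range requests. A positive block_size enables the block cache built by _read_cached, so reads of nearby offsets reuse data that has already been fetched.

from httpio_bbc import HTTPIOFile

# Hypothetical URL used only for illustration; the server must report
# Content-Length and accept Range headers.
URL = "https://example.com/large-file.bin"

with HTTPIOFile(URL, block_size=64 * 1024) as remote:
    remote.seek(1024)          # repositions the cursor; no request is issued yet
    header = remote.read(16)   # fetches and caches the 64 KiB block covering this range
    more = remote.read(16)     # served from the cached block, no additional request

Entering the context manager calls open(), which issues a single HEAD (or streamed GET) to discover the resource length before any data is read.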
def open(url, block_size=-1, **kwargs):
Open a URL as a file-like object

Parameters
- url: The URL of the file to open
- block_size: The cache block size, or `-1` to disable caching.
- kwargs: Additional arguments to pass to `requests.Request()`

Returns
An `httpio.HTTPIOFile` object supporting most of the usual file-like object methods.
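A short sketch of the module-level helper, again with a hypothetical URL. The extra keyword arguments shown here (headers, timeout) are forwarded to the underlying requests calls made by the returned object.

import httpio_bbc

# Hypothetical URL; caching is disabled because block_size defaults to -1.
f = httpio_bbc.open("https://example.com/archive.bin",
                    headers={"User-Agent": "httpio-example"},
                    timeout=30)
try:
    f.seek(-4, 2)        # seek relative to the end of the remote file
    trailer = f.read(4)  # a single ranged GET for the last four bytes
finally:
    f.close()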
class HTTPIOError(builtins.OSError):
Base class for I/O related errors.
Inherited Members
- builtins.OSError
  - OSError
  - errno
  - strerror
  - filename
  - filename2
  - characters_written
- builtins.BaseException
  - with_traceback
  - add_note
  - args
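A brief sketch of when HTTPIOError is raised, using a hypothetical URL: writes are rejected because the resource is read-only, and seeks outside the reported length fail.

from httpio_bbc import HTTPIOError, HTTPIOFile

# Hypothetical URL used only for illustration.
with HTTPIOFile("https://example.com/data.bin") as remote:
    try:
        remote.write(b"not allowed")     # HTTP resources are read-only
    except HTTPIOError as exc:
        print("write failed:", exc)

    try:
        remote.seek(remote.length + 1)   # cursor beyond the end of the resource
    except HTTPIOError as exc:
        print("seek failed:", exc)

Because HTTPIOError subclasses OSError (IOError on Python 2), existing handlers for those exceptions will also catch it.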
class HTTPIOFile(SyncHTTPIOFile, AsyncHTTPIOFileContextManagerMixin):

Base class for buffered IO objects.

The main difference with RawIOBase is that the read() method supports omitting the size argument, and does not have a default implementation that defers to readinto().

In addition, read(), readinto() and write() may raise BlockingIOError if the underlying raw stream is in non-blocking mode and not ready; unlike their raw counterparts, they will never return None.

A typical implementation should not inherit from a RawIOBase implementation, but wrap one.
Inherited Members
- SyncHTTPIOFile
  - SyncHTTPIOFile
  - url
  - block_size
  - no_head_request
  - length
  - open
  - close
  - flush
  - peek
  - read
  - read1
  - readable
  - readinto
  - readinto1
  - seek
  - seekable
  - tell
  - write
- _io._BufferedIOBase
  - detach
- _io._IOBase
  - truncate
  - writable
  - fileno
  - isatty
  - readline
  - readlines
  - writelines
  - closed
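Because HTTPIOFile exposes the buffered, seekable, read-only interface listed above, it can be handed to code that expects an ordinary binary file object. A small sketch with a hypothetical URL, exercising peek and readinto:

from httpio_bbc import HTTPIOFile

# Hypothetical URL; block_size enables the block cache used by the read methods.
with HTTPIOFile("https://example.com/image.dat", block_size=16 * 1024) as remote:
    magic = remote.peek(8)      # inspect the first bytes without moving the cursor
    buf = bytearray(4096)
    n = remote.readinto(buf)    # fill a caller-supplied buffer; returns bytes written
    print(magic[:4], n)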