You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1893 lines
70 KiB
1893 lines
70 KiB
from .decoders import * |
|
from .exceptions import * |
|
|
|
import os |
|
import re |
|
import sys |
|
import shutil |
|
import logging |
|
import tempfile |
|
from io import BytesIO |
|
from numbers import Number |
|
|
|
# Unique sentinel meaning "no value cached yet".  It is distinct from None,
# because None is itself a legal cached value (see Field.set_none()).
_missing = object()

# States for the querystring parser.
STATE_BEFORE_FIELD = 0  # Between fields: skipping '&' / ';' separators.
STATE_FIELD_NAME = 1    # Reading a field's name (up to '=' or a separator).
STATE_FIELD_DATA = 2    # Reading a field's value (up to a separator).
|
|
|
# States for the multipart parser.  The state machine (in
# MultipartParser._internal_write) advances through these as it consumes a
# multipart/form-data body; the STATES table below gives the human-readable
# name for each value.
STATE_START = 0
STATE_START_BOUNDARY = 1
STATE_HEADER_FIELD_START = 2
STATE_HEADER_FIELD = 3
STATE_HEADER_VALUE_START = 4
STATE_HEADER_VALUE = 5
STATE_HEADER_VALUE_ALMOST_DONE = 6
STATE_HEADERS_ALMOST_DONE = 7
STATE_PART_DATA_START = 8
STATE_PART_DATA = 9
STATE_PART_DATA_END = 10
STATE_END = 11
|
|
|
# Human-readable names for each parser state, indexed by the STATE_*
# constants above (useful when logging/debugging the parser).
STATES = [
    "START",
    "START_BOUNDARY",
    "HEADER_FIELD_START",
    "HEADER_FIELD",
    "HEADER_VALUE_START",
    "HEADER_VALUE",
    "HEADER_VALUE_ALMOST_DONE",
    # BUG FIX: was misspelled "HEADRES_ALMOST_DONE", which did not match
    # STATE_HEADERS_ALMOST_DONE and produced a confusing debug name.
    "HEADERS_ALMOST_DONE",
    "PART_DATA_START",
    "PART_DATA",
    "PART_DATA_END",
    "END",
]
|
|
|
|
|
# Flags for the multipart parser.
# NOTE(review): these appear to be bitmask values combined into
# MultipartParser.flags (initialized to 0 in __init__); confirm against the
# state machine, which is not fully visible here.
FLAG_PART_BOUNDARY = 1
FLAG_LAST_BOUNDARY = 2

# Single-byte ordinal constants.  Indexing a bytes object on Python 3 yields
# an int (e.g. b'\r'[0] == 13), so these are the integer byte values the
# parsers compare against while walking over input chunks.  (The original
# rationale was Python 2/3 compatibility, where iterating a str gave
# 1-length strings but iterating bytes gives integers.)
CR = b'\r'[0]
LF = b'\n'[0]
COLON = b':'[0]
SPACE = b' '[0]
HYPHEN = b'-'[0]
AMPERSAND = b'&'[0]
SEMICOLON = b';'[0]
LOWER_A = b'a'[0]
LOWER_Z = b'z'[0]
NULL = b'\x00'[0]
|
|
|
# Small helpers over byte values (integers).  These were historically lambdas
# assigned to names (a PEP 8 E731 violation); plain functions behave
# identically and carry proper names/docstrings.


def lower_char(c):
    """Lower-case a single ASCII byte value using the 0x20 bit trick."""
    return c | 0x20


def ord_char(c):
    """Identity on Python 3: indexing/iterating bytes already yields ints."""
    return c


def join_bytes(b):
    """Join an iterable of byte values (integers) into a bytes object."""
    return bytes(list(b))
|
|
|
# Regexes for parsing header option values (e.g. the parameters of a
# Content-Disposition or Content-Type header).

# Characters that cannot appear in an unquoted parameter value.
SPECIAL_CHARS = re.escape(b'()<>@,;:\\"/[]?={} \t')

# A double-quoted string, allowing backslash escapes inside.
QUOTED_STR = br'"(?:\\.|[^"])*"'

# A parameter value: either a bare token or a quoted string.
VALUE_STR = br'(?:[^' + SPECIAL_CHARS + br']+|' + QUOTED_STR + br')'

# One "key=value" option, preceded by ';' or the start of the string.
# Group 1 is the key, group 2 the (possibly quoted) value.
OPTION_RE_STR = (
    br'(?:;|^)\s*([^' + SPECIAL_CHARS + br']+)\s*=\s*(' + VALUE_STR + br')'
)
OPTION_RE = re.compile(OPTION_RE_STR)

# Ordinal of the double-quote character, for quick first/last-byte checks.
QUOTE = b'"'[0]
|
|
|
|
|
def parse_options_header(value):
    """Parse a Content-Type-style header into its value and parameters.

    Returns a 2-tuple in the form ``(content_type, {parameters})``, where
    ``content_type`` is a lower-cased, stripped bytestring and the parameter
    dict maps lower-cased bytes keys to (unquoted) bytes values.

    :param value: the header value, as ``bytes`` or a latin-1 ``str``
    """
    if not value:
        return (b'', {})

    # If we are passed a string, we assume that it conforms to WSGI and does
    # not contain any code point that's not in latin-1.
    if isinstance(value, str):  # pragma: no cover
        value = value.encode('latin-1')

    # If we have no options, return the string as-is.
    if b';' not in value:
        return (value.lower().strip(), {})

    # Split at the first semicolon, to get our value and then options.
    ctype, rest = value.split(b';', 1)
    options = {}

    # Parse the options.
    for match in OPTION_RE.finditer(rest):
        key = match.group(1).lower()
        # Use a distinct name so we don't shadow the 'value' argument.
        opt_value = match.group(2)
        if opt_value[0] == QUOTE and opt_value[-1] == QUOTE:
            # Unquote the value, resolving backslash escapes.
            opt_value = opt_value[1:-1]
            opt_value = opt_value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')

        # If the value is a filename, we need to fix a bug on IE6 that sends
        # the full file path instead of the filename.
        if key == b'filename':
            if opt_value[1:3] == b':\\' or opt_value[:2] == b'\\\\':
                opt_value = opt_value.split(b'\\')[-1]

        options[key] = opt_value

    # BUG FIX: normalize the content type the same way the no-options branch
    # above does.  Previously it was returned raw, so e.g. b"Text/HTML; q=1"
    # and b"Text/HTML" normalized differently.
    return ctype.lower().strip(), options
|
|
|
|
|
class Field:
    """A Field object represents a (parsed) form field. It represents a single
    field with a corresponding name and value.

    The name that a :class:`Field` will be instantiated with is the same name
    that would be found in the following HTML::

        <input name="name_goes_here" type="text"/>

    This class defines two methods, :meth:`on_data` and :meth:`on_end`, that
    will be called when data is written to the Field, and when the Field is
    finalized, respectively.

    :param name: the name of the form field
    """
    def __init__(self, name):
        self._name = name
        self._value = []

        # We cache the joined version of _value for speed.  The _missing
        # sentinel means "not cached yet"; None is itself a legal cached
        # value (see set_none()).
        self._cache = _missing

    @classmethod
    def from_value(cls, name, value):
        """Create an instance of a :class:`Field`, and set the corresponding
        value - either None or an actual value. This method will also
        finalize the Field itself.

        :param name: the name of the form field
        :param value: the value of the form field - either a bytestring or
                      None
        """
        f = cls(name)
        if value is None:
            f.set_none()
        else:
            f.write(value)
        f.finalize()
        return f

    def write(self, data):
        """Write some data into the form field.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data):
        """This method is a callback that will be called whenever data is
        written to the Field.

        :param data: a bytestring
        """
        self._value.append(data)
        # Invalidate the cache; it is rebuilt lazily on access.
        self._cache = _missing
        return len(data)

    def on_end(self):
        """This method is called whenever the Field is finalized.
        """
        if self._cache is _missing:
            self._cache = b''.join(self._value)

    def finalize(self):
        """Finalize the form field.
        """
        self.on_end()

    def close(self):
        """Close the Field object. This will free any underlying cache.
        """
        # Materialize the cached value before dropping the chunk list, so
        # .value still works after close().
        if self._cache is _missing:
            self._cache = b''.join(self._value)

        del self._value

    def set_none(self):
        """Some fields in a querystring can possibly have a value of None - for
        example, the string "foo&bar=&baz=asdf" will have a field with the
        name "foo" and value None, one with name "bar" and value "", and one
        with name "baz" and value "asdf". Since the write() interface doesn't
        support writing None, this function will set the field value to None.
        """
        self._cache = None

    @property
    def field_name(self):
        """This property returns the name of the field."""
        return self._name

    @property
    def value(self):
        """This property returns the value of the form field."""
        if self._cache is _missing:
            self._cache = b''.join(self._value)

        return self._cache

    def __eq__(self, other):
        if isinstance(other, Field):
            return (
                self.field_name == other.field_name and
                self.value == other.value
            )
        return NotImplemented

    def __repr__(self):
        # BUG FIX: self.value is None for fields created via set_none();
        # the old code unconditionally called len(self.value) and crashed
        # with a TypeError when building the repr of such a field.
        if self.value is not None and len(self.value) > 97:
            # We get the repr, and then insert three dots before the final
            # quote.
            v = repr(self.value[:97])[:-1] + "...'"
        else:
            v = repr(self.value)

        return "{}(field_name={!r}, value={})".format(
            self.__class__.__name__,
            self.field_name,
            v
        )
|
|
|
|
|
class File:
    """This class represents an uploaded file. It handles writing file data to
    either an in-memory file or a temporary file on-disk, if the optional
    threshold is passed.

    There are some options that can be passed to the File to change behavior
    of the class.  Valid options are as follows:

    .. list-table::
       :widths: 15 5 5 30
       :header-rows: 1

       * - Name
         - Type
         - Default
         - Description
       * - UPLOAD_DIR
         - `str`
         - None
         - The directory to store uploaded files in. If this is None, a
           temporary file will be created in the system's standard location.
       * - UPLOAD_DELETE_TMP
         - `bool`
         - True
         - Delete automatically created TMP file
       * - UPLOAD_KEEP_FILENAME
         - `bool`
         - False
         - Whether or not to keep the filename of the uploaded file. If True,
           then the filename will be converted to a safe representation (e.g.
           by removing any invalid path segments), and then saved with the
           same name). Otherwise, a temporary name will be used.
       * - UPLOAD_KEEP_EXTENSIONS
         - `bool`
         - False
         - Whether or not to keep the uploaded file's extension. If False, the
           file will be saved with the default temporary extension (usually
           ".tmp"). Otherwise, the file's extension will be maintained. Note
           that this will properly combine with the UPLOAD_KEEP_FILENAME
           setting.
       * - MAX_MEMORY_FILE_SIZE
         - `int`
         - 1 MiB
         - The maximum number of bytes of a File to keep in memory. By
           default, the contents of a File are kept into memory until a
           certain limit is reached, after which the contents of the File are
           written to a temporary file. This behavior can be disabled by
           setting this value to an appropriately large value (or, for
           example, infinity, such as `float('inf')`.

    :param file_name: The name of the file that this :class:`File` represents

    :param field_name: The field name that uploaded this file. Note that this
                       can be None, if, for example, the file was uploaded
                       with Content-Type application/octet-stream

    :param config: The configuration for this File. See above for valid
                   configuration keys and their corresponding values.  If
                   None (the default), an empty configuration is used.
    """
    def __init__(self, file_name, field_name=None, config=None):
        # Save configuration, set other variables default.
        self.logger = logging.getLogger(__name__)
        # BUG FIX: was a mutable default argument (config={}), which is
        # shared between every instance created with the default.
        self._config = {} if config is None else config
        self._in_memory = True
        self._bytes_written = 0
        self._fileobj = BytesIO()

        # Save the provided field/file name.
        self._field_name = field_name
        self._file_name = file_name

        # Our actual file name is None by default, since, depending on our
        # config, we may not actually use the provided name.
        self._actual_file_name = None

        # Split the extension from the filename.
        if file_name is not None:
            base, ext = os.path.splitext(file_name)
            self._file_base = base
            self._ext = ext
        else:
            # BUG FIX: always define these attributes, so _get_disk_file()
            # cannot hit an AttributeError when no file name was provided
            # (e.g. an application/octet-stream upload).
            self._file_base = None
            self._ext = None

    @property
    def field_name(self):
        """The form field associated with this file. May be None if there isn't
        one, for example when we have an application/octet-stream upload.
        """
        return self._field_name

    @property
    def file_name(self):
        """The file name given in the upload request.
        """
        return self._file_name

    @property
    def actual_file_name(self):
        """The file name that this file is saved as. Will be None if it's not
        currently saved on disk.
        """
        return self._actual_file_name

    @property
    def file_object(self):
        """The file object that we're currently writing to. Note that this
        will either be an instance of a :class:`io.BytesIO`, or a regular file
        object.
        """
        return self._fileobj

    @property
    def size(self):
        """The total size of this file, counted as the number of bytes that
        currently have been written to the file.
        """
        return self._bytes_written

    @property
    def in_memory(self):
        """A boolean representing whether or not this file object is currently
        stored in-memory or on-disk.
        """
        return self._in_memory

    def flush_to_disk(self):
        """If the file is already on-disk, do nothing. Otherwise, copy from
        the in-memory buffer to a disk file, and then reassign our internal
        file object to this new disk file.

        Note that if you attempt to flush a file that is already on-disk, a
        warning will be logged to this module's logger.
        """
        if not self._in_memory:
            self.logger.warning(
                "Trying to flush to disk when we're not in memory"
            )
            return

        # Go back to the start of our file.
        self._fileobj.seek(0)

        # Open a new file.
        new_file = self._get_disk_file()

        # Copy the file objects.
        shutil.copyfileobj(self._fileobj, new_file)

        # Seek to the new position in our new file.
        new_file.seek(self._bytes_written)

        # Reassign the fileobject.
        old_fileobj = self._fileobj
        self._fileobj = new_file

        # We're no longer in memory.
        self._in_memory = False

        # Close the old file object.
        old_fileobj.close()

    def _get_disk_file(self):
        """Return an on-disk file object to write to, honoring the UPLOAD_*
        configuration options.

        :raises FileError: if the file could not be opened or created.
        """
        self.logger.info("Opening a file on disk")

        file_dir = self._config.get('UPLOAD_DIR')
        keep_filename = self._config.get('UPLOAD_KEEP_FILENAME', False)
        keep_extensions = self._config.get('UPLOAD_KEEP_EXTENSIONS', False)
        delete_tmp = self._config.get('UPLOAD_DELETE_TMP', True)

        # If we have a directory, are to keep the filename, and actually
        # have a filename to keep...
        if file_dir is not None and keep_filename and \
                self._file_base is not None:
            self.logger.info("Saving with filename in: %r", file_dir)

            # Build our filename.
            fname = self._file_base
            if keep_extensions:
                fname = fname + self._ext

            path = os.path.join(file_dir, fname)
            try:
                self.logger.info("Opening file: %r", path)
                tmp_file = open(path, 'w+b')
            except OSError as e:
                self.logger.exception("Error opening temporary file")
                # Chain the original error for easier debugging.
                raise FileError(
                    "Error opening temporary file: %r" % path
                ) from e
        else:
            # Build options array.
            # Note that on Python 3, tempfile doesn't support byte names. We
            # encode our paths using the default filesystem encoding.
            options = {}
            if keep_extensions and self._ext is not None:
                ext = self._ext
                if isinstance(ext, bytes):
                    ext = ext.decode(sys.getfilesystemencoding())

                options['suffix'] = ext
            if file_dir is not None:
                d = file_dir
                if isinstance(d, bytes):
                    d = d.decode(sys.getfilesystemencoding())

                options['dir'] = d
            options['delete'] = delete_tmp

            # Create a temporary (named) file with the appropriate settings.
            self.logger.info("Creating a temporary file with options: %r",
                             options)
            try:
                tmp_file = tempfile.NamedTemporaryFile(**options)
            except OSError as e:
                self.logger.exception("Error creating named temporary file")
                raise FileError("Error creating named temporary file") from e

        fname = tmp_file.name

        # Encode filename as bytes.
        if isinstance(fname, str):
            fname = fname.encode(sys.getfilesystemencoding())

        self._actual_file_name = fname
        return tmp_file

    def write(self, data):
        """Write some data to the File.

        :param data: a bytestring
        """
        return self.on_data(data)

    def on_data(self, data):
        """This method is a callback that will be called whenever data is
        written to the File.

        :param data: a bytestring
        """
        pos = self._fileobj.tell()
        bwritten = self._fileobj.write(data)
        # True file objects' write() returns None; compute the byte count
        # from the position delta instead.
        if bwritten is None:
            bwritten = self._fileobj.tell() - pos

        # If the bytes written isn't the same as the length, just return.
        if bwritten != len(data):
            self.logger.warning("bwritten != len(data) (%d != %d)", bwritten,
                                len(data))
            return bwritten

        # Keep track of how many bytes we've written.
        self._bytes_written += bwritten

        # If we're in-memory and are over our limit, we create a file.
        if (self._in_memory and
                self._config.get('MAX_MEMORY_FILE_SIZE') is not None and
                (self._bytes_written >
                 self._config.get('MAX_MEMORY_FILE_SIZE'))):
            self.logger.info("Flushing to disk")
            self.flush_to_disk()

        # Return the number of bytes written.
        return bwritten

    def on_end(self):
        """This method is called whenever the File is finalized.
        """
        # Flush the underlying file object
        self._fileobj.flush()

    def finalize(self):
        """Finalize the form file. This will not close the underlying file,
        but simply signal that we are finished writing to the File.
        """
        self.on_end()

    def close(self):
        """Close the File object. This will actually close the underlying
        file object (whether it's a :class:`io.BytesIO` or an actual file
        object).
        """
        self._fileobj.close()

    def __repr__(self):
        return "{}(file_name={!r}, field_name={!r})".format(
            self.__class__.__name__,
            self.file_name,
            self.field_name
        )
|
|
|
|
|
class BaseParser:
    """This class is the base class for all parsers. It contains the logic for
    calling and adding callbacks.

    A callback can be one of two different forms. "Notification callbacks" are
    callbacks that are called when something happens - for example, when a new
    part of a multipart message is encountered by the parser. "Data callbacks"
    are called when we get some sort of data - for example, part of the body of
    a multipart chunk. Notification callbacks are called with no parameters,
    whereas data callbacks are called with three, as follows::

        data_callback(data, start, end)

    The "data" parameter is a bytestring. "start" and "end" are integer
    indexes into the "data" string that represent the data of interest. Thus,
    in a data callback, the slice `data[start:end]` represents the data that
    the callback is "interested in". The callback is not passed a copy of the
    data, since copying severely hurts performance.
    """
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # BUG FIX: previously self.callbacks was only ever set by subclasses,
        # so callback()/set_callback() raised AttributeError on a bare
        # BaseParser.  Subclasses may (and do) replace this dict.
        self.callbacks = {}

    def callback(self, name, data=None, start=None, end=None):
        """This function calls a provided callback with some data. If the
        callback is not set, will do nothing.

        :param name: The name of the callback to call (as a string).

        :param data: Data to pass to the callback. If None, then it is
                     assumed that the callback is a notification callback,
                     and no parameters are given.

        :param end: An integer that is passed to the data callback.

        :param start: An integer that is passed to the data callback.
        """
        name = "on_" + name
        func = self.callbacks.get(name)
        if func is None:
            return

        # Depending on whether we're given a buffer...
        if data is not None:
            # Don't do anything if we have start == end.
            if start is not None and start == end:
                return

            # BUG FIX: use %r rather than %d - start/end may be None when the
            # caller passes only a data buffer, and %d would raise a
            # TypeError as soon as DEBUG logging is enabled.
            self.logger.debug("Calling %s with data[%r:%r]", name, start, end)
            func(data, start, end)
        else:
            self.logger.debug("Calling %s with no data", name)
            func()

    def set_callback(self, name, new_func):
        """Update the function for a callback. Removes from the callbacks dict
        if new_func is None.

        :param name: The name of the callback to call (as a string).

        :param new_func: The new function for the callback. If None, then the
                         callback will be removed (with no error if it does
                         not exist).
        """
        if new_func is None:
            self.callbacks.pop('on_' + name, None)
        else:
            self.callbacks['on_' + name] = new_func

    def close(self):
        pass  # pragma: no cover

    def finalize(self):
        pass  # pragma: no cover

    def __repr__(self):
        return "%s()" % self.__class__.__name__
|
|
|
|
|
class OctetStreamParser(BaseParser):
    """This parser parses an octet-stream request body and calls callbacks when
    incoming data is received. Callbacks are as follows:

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_start
         - None
         - Called when the first data is parsed.
       * - on_data
         - data, start, end
         - Called for each data chunk that is parsed.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`. Defaults to an empty dict.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """
    def __init__(self, callbacks=None, max_size=float('inf')):
        super().__init__()
        # BUG FIX: avoid the mutable default argument (callbacks={}), which
        # would be the same shared dict for every parser instance.
        self.callbacks = {} if callbacks is None else callbacks
        self._started = False

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" %
                             max_size)
        self.max_size = max_size
        self._current_size = 0

    def write(self, data):
        """Write some data to the parser, which will perform size verification,
        and then pass the data to the underlying callback.

        :param data: a bytestring
        """
        if not self._started:
            self.callback('start')
            self._started = True

        # Truncate data length so that we never process more than max_size
        # bytes in total.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning("Current size is %d (max %d), so truncating "
                                "data length from %d to %d",
                                self._current_size, self.max_size, data_len,
                                new_size)
            data_len = new_size

        # Increment size, then callback, in case there's an exception.
        self._current_size += data_len
        self.callback('data', data, 0, data_len)
        return data_len

    def finalize(self):
        """Finalize this parser, which signals to that we are finished parsing,
        and sends the on_end callback.
        """
        self.callback('end')

    def __repr__(self):
        return "%s()" % self.__class__.__name__
|
|
|
|
|
class QuerystringParser(BaseParser):
    """This is a streaming querystring parser. It will consume data, and call
    the callbacks given when it has data.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_field_start
         - None
         - Called when a new field is encountered.
       * - on_field_name
         - data, start, end
         - Called when a portion of a field's name is encountered.
       * - on_field_data
         - data, start, end
         - Called when a portion of a field's data is encountered.
       * - on_field_end
         - None
         - Called when the end of a field is encountered.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param callbacks: A dictionary of callbacks. See the documentation for
                      :class:`BaseParser`. Defaults to an empty dict.

    :param strict_parsing: Whether or not to parse the body strictly. Defaults
                           to False. If this is set to True, then the behavior
                           of the parser changes as the following: if a field
                           has a value with an equal sign (e.g. "foo=bar", or
                           "foo="), it is always included. If a field has no
                           equals sign (e.g. "...&name&..."), it will be
                           treated as an error if 'strict_parsing' is True,
                           otherwise included. If an error is encountered,
                           then a
                           :class:`multipart.exceptions.QuerystringParseError`
                           will be raised.

    :param max_size: The maximum size of body to parse. Defaults to infinity -
                     i.e. unbounded.
    """
    def __init__(self, callbacks=None, strict_parsing=False,
                 max_size=float('inf')):
        super().__init__()
        self.state = STATE_BEFORE_FIELD
        self._found_sep = False

        # BUG FIX: avoid the mutable default argument (callbacks={}), which
        # would be the same shared dict for every parser instance.
        self.callbacks = {} if callbacks is None else callbacks

        # Max-size stuff
        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" %
                             max_size)
        self.max_size = max_size
        self._current_size = 0

        # Should parsing be strict?
        self.strict_parsing = strict_parsing

    def write(self, data):
        """Write some data to the parser, which will perform size verification,
        parse into either a field name or value, and then pass the
        corresponding data to the underlying callback. If an error is
        encountered while parsing, a QuerystringParseError will be raised. The
        "offset" attribute of the raised exception will be set to the offset in
        the input data chunk (NOT the overall stream) that caused the error.

        :param data: a bytestring
        """
        # Handle sizing: never process more than max_size bytes in total.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning("Current size is %d (max %d), so truncating "
                                "data length from %d to %d",
                                self._current_size, self.max_size, data_len,
                                new_size)
            data_len = new_size

        consumed = 0
        try:
            consumed = self._internal_write(data, data_len)
        finally:
            # Account for what was actually processed, even if an error was
            # raised partway through.
            self._current_size += consumed

        return consumed

    def _internal_write(self, data, length):
        """Run the state machine over data[:length]; returns bytes consumed.

        :param data: a bytestring
        :param length: the number of bytes of ``data`` to process
        """
        state = self.state
        strict_parsing = self.strict_parsing
        found_sep = self._found_sep

        i = 0
        while i < length:
            ch = data[i]

            # Depending on our state...
            if state == STATE_BEFORE_FIELD:
                # If the 'found_sep' flag is set, we've already encountered
                # and skipped a single separator. If so, we check our strict
                # parsing flag and decide what to do. Otherwise, we haven't
                # yet reached a separator, and thus, if we do, we need to skip
                # it as it will be the boundary between fields that's supposed
                # to be there.
                if ch == AMPERSAND or ch == SEMICOLON:
                    if found_sep:
                        # If we're parsing strictly, we disallow blank chunks.
                        if strict_parsing:
                            e = QuerystringParseError(
                                "Skipping duplicate ampersand/semicolon at "
                                "%d" % i
                            )
                            e.offset = i
                            raise e
                        else:
                            self.logger.debug("Skipping duplicate ampersand/"
                                              "semicolon at %d", i)
                    else:
                        # This case is when we're skipping the (first)
                        # separator between fields, so we just set our flag
                        # and continue on.
                        found_sep = True
                else:
                    # Emit a field-start event, and go to that state. Also,
                    # reset the "found_sep" flag, for the next time we get to
                    # this state.
                    self.callback('field_start')
                    i -= 1
                    state = STATE_FIELD_NAME
                    found_sep = False

            elif state == STATE_FIELD_NAME:
                # Try and find a separator - we ensure that, if we do, we only
                # look for the equal sign before it.
                sep_pos = data.find(b'&', i)
                if sep_pos == -1:
                    sep_pos = data.find(b';', i)

                # See if we can find an equals sign in the remaining data. If
                # so, we can immediately emit the field name and jump to the
                # data state.
                if sep_pos != -1:
                    equals_pos = data.find(b'=', i, sep_pos)
                else:
                    equals_pos = data.find(b'=', i)

                if equals_pos != -1:
                    # Emit this name.
                    self.callback('field_name', data, i, equals_pos)

                    # Jump i to this position. Note that it will then have 1
                    # added to it below, which means the next iteration of this
                    # loop will inspect the character after the equals sign.
                    i = equals_pos
                    state = STATE_FIELD_DATA
                else:
                    # No equals sign found.
                    if not strict_parsing:
                        # See also comments in the STATE_FIELD_DATA case below.
                        # If we found the separator, we emit the name and just
                        # end - there's no data callback at all (not even with
                        # a blank value).
                        if sep_pos != -1:
                            self.callback('field_name', data, i, sep_pos)
                            self.callback('field_end')

                            i = sep_pos - 1
                            state = STATE_BEFORE_FIELD
                        else:
                            # Otherwise, no separator in this block, so the
                            # rest of this chunk must be a name.
                            self.callback('field_name', data, i, length)
                            i = length

                    else:
                        # We're parsing strictly. If we find a separator,
                        # this is an error - we require an equals sign.
                        if sep_pos != -1:
                            e = QuerystringParseError(
                                "When strict_parsing is True, we require an "
                                "equals sign in all field chunks. Did not "
                                "find one in the chunk that starts at %d" %
                                (i,)
                            )
                            e.offset = i
                            raise e

                        # No separator in the rest of this chunk, so it's just
                        # a field name.
                        self.callback('field_name', data, i, length)
                        i = length

            elif state == STATE_FIELD_DATA:
                # Try finding either an ampersand or a semicolon after this
                # position.
                sep_pos = data.find(b'&', i)
                if sep_pos == -1:
                    sep_pos = data.find(b';', i)

                # If we found it, callback this bit as data and then go back
                # to expecting to find a field.
                if sep_pos != -1:
                    self.callback('field_data', data, i, sep_pos)
                    self.callback('field_end')

                    # Note that we go to the separator, which brings us to the
                    # "before field" state. This allows us to properly emit
                    # "field_start" events only when we actually have data for
                    # a field of some sort.
                    i = sep_pos - 1
                    state = STATE_BEFORE_FIELD

                # Otherwise, emit the rest as data and finish.
                else:
                    self.callback('field_data', data, i, length)
                    i = length

            else:  # pragma: no cover (error case)
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = QuerystringParseError(msg)
                e.offset = i
                raise e

            i += 1

        self.state = state
        self._found_sep = found_sep
        # BUG FIX: return the number of bytes actually processed, not
        # len(data).  When write() truncates the chunk to honor max_size,
        # returning len(data) over-reported consumption and over-counted
        # _current_size.
        return length

    def finalize(self):
        """Finalize this parser, which signals to that we are finished parsing,
        if we're still in the middle of a field, an on_field_end callback, and
        then the on_end callback.
        """
        # If we're currently in the middle of a field, we finish it.
        if self.state == STATE_FIELD_DATA:
            self.callback('field_end')
        self.callback('end')

    def __repr__(self):
        return "{}(strict_parsing={!r}, max_size={!r})".format(
            self.__class__.__name__,
            self.strict_parsing, self.max_size
        )
|
|
|
|
|
class MultipartParser(BaseParser): |
|
"""This class is a streaming multipart/form-data parser. |
|
|
|
.. list-table:: |
|
:widths: 15 10 30 |
|
:header-rows: 1 |
|
|
|
* - Callback Name |
|
- Parameters |
|
- Description |
|
* - on_part_begin |
|
- None |
|
- Called when a new part of the multipart message is encountered. |
|
* - on_part_data |
|
- data, start, end |
|
- Called when a portion of a part's data is encountered. |
|
* - on_part_end |
|
- None |
|
- Called when the end of a part is reached. |
|
* - on_header_begin |
|
- None |
|
- Called when we've found a new header in a part of a multipart |
|
message |
|
* - on_header_field |
|
- data, start, end |
|
- Called each time an additional portion of a header is read (i.e. the |
|
part of the header that is before the colon; the "Foo" in |
|
"Foo: Bar"). |
|
* - on_header_value |
|
- data, start, end |
|
- Called when we get data for a header. |
|
* - on_header_end |
|
- None |
|
- Called when the current header is finished - i.e. we've reached the |
|
newline at the end of the header. |
|
* - on_headers_finished |
|
- None |
|
- Called when all headers are finished, and before the part data |
|
starts. |
|
* - on_end |
|
- None |
|
- Called when the parser is finished parsing all data. |
|
|
|
|
|
:param boundary: The multipart boundary. This is required, and must match |
|
what is given in the HTTP request - usually in the |
|
Content-Type header. |
|
|
|
:param callbacks: A dictionary of callbacks. See the documentation for |
|
:class:`BaseParser`. |
|
|
|
:param max_size: The maximum size of body to parse. Defaults to infinity - |
|
i.e. unbounded. |
|
""" |
|
|
|
def __init__(self, boundary, callbacks={}, max_size=float('inf')): |
|
# Initialize parser state. |
|
super().__init__() |
|
self.state = STATE_START |
|
self.index = self.flags = 0 |
|
|
|
self.callbacks = callbacks |
|
|
|
if not isinstance(max_size, Number) or max_size < 1: |
|
raise ValueError("max_size must be a positive number, not %r" % |
|
max_size) |
|
self.max_size = max_size |
|
self._current_size = 0 |
|
|
|
# Setup marks. These are used to track the state of data received. |
|
self.marks = {} |
|
|
|
# TODO: Actually use this rather than the dumb version we currently use |
|
# # Precompute the skip table for the Boyer-Moore-Horspool algorithm. |
|
# skip = [len(boundary) for x in range(256)] |
|
# for i in range(len(boundary) - 1): |
|
# skip[ord_char(boundary[i])] = len(boundary) - i - 1 |
|
# |
|
# # We use a tuple since it's a constant, and marginally faster. |
|
# self.skip = tuple(skip) |
|
|
|
# Save our boundary. |
|
if isinstance(boundary, str): # pragma: no cover |
|
boundary = boundary.encode('latin-1') |
|
self.boundary = b'\r\n--' + boundary |
|
|
|
# Get a set of characters that belong to our boundary. |
|
self.boundary_chars = frozenset(self.boundary) |
|
|
|
# We also create a lookbehind list. |
|
# Note: the +8 is since we can have, at maximum, "\r\n--" + boundary + |
|
# "--\r\n" at the final boundary, and the length of '\r\n--' and |
|
# '--\r\n' is 8 bytes. |
|
self.lookbehind = [NULL for x in range(len(boundary) + 8)] |
|
|
|
def write(self, data): |
|
"""Write some data to the parser, which will perform size verification, |
|
and then parse the data into the appropriate location (e.g. header, |
|
data, etc.), and pass this on to the underlying callback. If an error |
|
is encountered, a MultipartParseError will be raised. The "offset" |
|
attribute on the raised exception will be set to the offset of the byte |
|
in the input chunk that caused the error. |
|
|
|
:param data: a bytestring |
|
""" |
|
# Handle sizing. |
|
data_len = len(data) |
|
if (self._current_size + data_len) > self.max_size: |
|
# We truncate the length of data that we are to process. |
|
new_size = int(self.max_size - self._current_size) |
|
self.logger.warning("Current size is %d (max %d), so truncating " |
|
"data length from %d to %d", |
|
self._current_size, self.max_size, data_len, |
|
new_size) |
|
data_len = new_size |
|
|
|
l = 0 |
|
try: |
|
l = self._internal_write(data, data_len) |
|
finally: |
|
self._current_size += l |
|
|
|
return l |
|
|
|
    def _internal_write(self, data, length):
        """Run the multipart state machine over ``data[:length]``.

        This is the core byte-at-a-time parser.  ``self.state``,
        ``self.index`` and ``self.flags`` persist between calls so a message
        may be fed in arbitrary chunks; ``self.marks`` remembers where each
        in-progress token (header field/value, part data) began within the
        current chunk so callbacks can be issued with (buffer, start, end)
        spans and no copying.  Returns ``length`` on success; raises
        MultipartParseError (with an ``offset`` attribute) on malformed input.
        """
        # Get values from locals.
        boundary = self.boundary

        # Get our state, flags and index. These are persisted between calls to
        # this function.
        state = self.state
        index = self.index
        flags = self.flags

        # Our index defaults to 0.
        i = 0

        # Set a mark.
        def set_mark(name):
            self.marks[name] = i

        # Remove a mark.
        def delete_mark(name, reset=False):
            self.marks.pop(name, None)

        # Helper function that makes calling a callback with data easier. The
        # 'remaining' parameter will callback from the marked value until the
        # end of the buffer, and reset the mark, instead of deleting it. This
        # is used at the end of the function to call our callbacks with any
        # remaining data in this chunk.
        def data_callback(name, remaining=False):
            marked_index = self.marks.get(name)
            if marked_index is None:
                return

            # If we're getting remaining data, we ignore the current i value
            # and just call with the remaining data.
            if remaining:
                self.callback(name, data, marked_index, length)
                self.marks[name] = 0

            # Otherwise, we call it from the mark to the current byte we're
            # processing.
            else:
                self.callback(name, data, marked_index, i)
                self.marks.pop(name, None)

        # For each byte...
        while i < length:
            c = data[i]

            if state == STATE_START:
                # Skip leading newlines
                if c == CR or c == LF:
                    i += 1
                    self.logger.debug("Skipping leading CR/LF at %d", i)
                    continue

                # index is used as in index into our boundary. Set to 0.
                index = 0

                # Move to the next state, but decrement i so that we re-process
                # this character.
                state = STATE_START_BOUNDARY
                i -= 1

            elif state == STATE_START_BOUNDARY:
                # Check to ensure that the last 2 characters in our boundary
                # are CRLF.
                # Note: the first boundary has no leading CRLF, so comparisons
                # below use boundary[index + 2] to skip the stored b'\r\n'.
                if index == len(boundary) - 2:
                    if c != CR:
                        # Error!
                        msg = "Did not find CR at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    index += 1

                elif index == len(boundary) - 2 + 1:
                    if c != LF:
                        msg = "Did not find LF at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # The index is now used for indexing into our boundary.
                    index = 0

                    # Callback for the start of a part.
                    self.callback('part_begin')

                    # Move to the next character and state.
                    state = STATE_HEADER_FIELD_START

                else:
                    # Check to ensure our boundary matches
                    if c != boundary[index + 2]:
                        msg = "Did not find boundary character %r at index " \
                              "%d" % (c, index + 2)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Increment index into boundary and continue.
                    index += 1

            elif state == STATE_HEADER_FIELD_START:
                # Mark the start of a header field here, reset the index, and
                # continue parsing our header field.
                index = 0

                # Set a mark of our header field.
                set_mark('header_field')

                # Move to parsing header fields.
                state = STATE_HEADER_FIELD
                i -= 1

            elif state == STATE_HEADER_FIELD:
                # If we've reached a CR at the beginning of a header, it means
                # that we've reached the second of 2 newlines, and so there are
                # no more headers to parse.
                if c == CR:
                    delete_mark('header_field')
                    state = STATE_HEADERS_ALMOST_DONE
                    i += 1
                    continue

                # Increment our index in the header.
                index += 1

                # Do nothing if we encounter a hyphen.
                if c == HYPHEN:
                    pass

                # If we've reached a colon, we're done with this header.
                elif c == COLON:
                    # A 0-length header is an error.
                    if index == 1:
                        msg = "Found 0-length header at %d" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Call our callback with the header field.
                    data_callback('header_field')

                    # Move to parsing the header value.
                    state = STATE_HEADER_VALUE_START

                else:
                    # Lower-case this character, and ensure that it is in fact
                    # a valid letter. If not, it's an error.
                    cl = lower_char(c)
                    if cl < LOWER_A or cl > LOWER_Z:
                        msg = "Found non-alphanumeric character %r in " \
                              "header at %d" % (c, i)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

            elif state == STATE_HEADER_VALUE_START:
                # Skip leading spaces.
                if c == SPACE:
                    i += 1
                    continue

                # Mark the start of the header value.
                set_mark('header_value')

                # Move to the header-value state, reprocessing this character.
                state = STATE_HEADER_VALUE
                i -= 1

            elif state == STATE_HEADER_VALUE:
                # If we've got a CR, we're nearly done our headers. Otherwise,
                # we do nothing and just move past this character.
                if c == CR:
                    data_callback('header_value')
                    self.callback('header_end')
                    state = STATE_HEADER_VALUE_ALMOST_DONE

            elif state == STATE_HEADER_VALUE_ALMOST_DONE:
                # The last character should be a LF. If not, it's an error.
                if c != LF:
                    msg = "Did not find LF character at end of header " \
                          "(found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                # Move back to the start of another header. Note that if that
                # state detects ANOTHER newline, it'll trigger the end of our
                # headers.
                state = STATE_HEADER_FIELD_START

            elif state == STATE_HEADERS_ALMOST_DONE:
                # We're almost done our headers. This is reached when we parse
                # a CR at the beginning of a header, so our next character
                # should be a LF, or it's an error.
                if c != LF:
                    msg = f"Did not find LF at end of headers (found {c!r})"
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                self.callback('headers_finished')
                state = STATE_PART_DATA_START

            elif state == STATE_PART_DATA_START:
                # Mark the start of our part data.
                set_mark('part_data')

                # Start processing part data, including this character.
                state = STATE_PART_DATA
                i -= 1

            elif state == STATE_PART_DATA:
                # We're processing our part data right now. During this, we
                # need to efficiently search for our boundary, since any data
                # on any number of lines can be a part of the current data.
                # We use the Boyer-Moore-Horspool algorithm to efficiently
                # search through the remainder of the buffer looking for our
                # boundary.

                # Save the current value of our index. We use this in case we
                # find part of a boundary, but it doesn't match fully.
                prev_index = index

                # Set up variables.
                boundary_length = len(boundary)
                boundary_end = boundary_length - 1
                data_length = length
                boundary_chars = self.boundary_chars

                # If our index is 0, we're starting a new part, so start our
                # search.
                if index == 0:
                    # Search forward until we either hit the end of our buffer,
                    # or reach a character that's in our boundary.
                    i += boundary_end
                    while i < data_length - 1 and data[i] not in boundary_chars:
                        i += boundary_length

                    # Reset i back the length of our boundary, which is the
                    # earliest possible location that could be our match (i.e.
                    # if we've just broken out of our loop since we saw the
                    # last character in our boundary)
                    i -= boundary_end
                    c = data[i]

                # Now, we have a couple of cases here. If our index is before
                # the end of the boundary...
                if index < boundary_length:
                    # If the character matches...
                    if boundary[index] == c:
                        # If we found a match for our boundary, we send the
                        # existing data.
                        if index == 0:
                            data_callback('part_data')

                        # The current character matches, so continue!
                        index += 1
                    else:
                        index = 0

                # Our index is equal to the length of our boundary!
                elif index == boundary_length:
                    # First we increment it.
                    index += 1

                    # Now, if we've reached a newline, we need to set this as
                    # the potential end of our boundary.
                    if c == CR:
                        flags |= FLAG_PART_BOUNDARY

                    # Otherwise, if this is a hyphen, we might be at the last
                    # of all boundaries.
                    elif c == HYPHEN:
                        flags |= FLAG_LAST_BOUNDARY

                    # Otherwise, we reset our index, since this isn't either a
                    # newline or a hyphen.
                    else:
                        index = 0

                # Our index is right after the part boundary, which should be
                # a LF.
                elif index == boundary_length + 1:
                    # If we're at a part boundary (i.e. we've seen a CR
                    # character already)...
                    if flags & FLAG_PART_BOUNDARY:
                        # We need a LF character next.
                        if c == LF:
                            # Unset the part boundary flag.
                            flags &= (~FLAG_PART_BOUNDARY)

                            # Callback indicating that we've reached the end of
                            # a part, and are starting a new one.
                            self.callback('part_end')
                            self.callback('part_begin')

                            # Move to parsing new headers.
                            index = 0
                            state = STATE_HEADER_FIELD_START
                            i += 1
                            continue

                        # We didn't find an LF character, so no match. Reset
                        # our index and clear our flag.
                        index = 0
                        flags &= (~FLAG_PART_BOUNDARY)

                    # Otherwise, if we're at the last boundary (i.e. we've
                    # seen a hyphen already)...
                    elif flags & FLAG_LAST_BOUNDARY:
                        # We need a second hyphen here.
                        if c == HYPHEN:
                            # Callback to end the current part, and then the
                            # message.
                            self.callback('part_end')
                            self.callback('end')
                            state = STATE_END
                        else:
                            # No match, so reset index.
                            index = 0

                # If we have an index, we need to keep this byte for later, in
                # case we can't match the full boundary.
                if index > 0:
                    self.lookbehind[index - 1] = c

                # Otherwise, our index is 0. If the previous index is not, it
                # means we reset something, and we need to take the data we
                # thought was part of our boundary and send it along as actual
                # data.
                elif prev_index > 0:
                    # Callback to write the saved data.
                    lb_data = join_bytes(self.lookbehind)
                    self.callback('part_data', lb_data, 0, prev_index)

                    # Overwrite our previous index.
                    prev_index = 0

                    # Re-set our mark for part data.
                    set_mark('part_data')

                    # Re-consider the current character, since this could be
                    # the start of the boundary itself.
                    i -= 1

            elif state == STATE_END:
                # Do nothing and just consume a byte in the end state.
                if c not in (CR, LF):
                    self.logger.warning("Consuming a byte '0x%x' in the end state", c)

            else:  # pragma: no cover (error case)
                # We got into a strange state somehow! Just stop processing.
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = MultipartParseError(msg)
                e.offset = i
                raise e

            # Move to the next byte.
            i += 1

        # We call our callbacks with any remaining data. Note that we pass
        # the 'remaining' flag, which sets the mark back to 0 instead of
        # deleting it, if it's found. This is because, if the mark is found
        # at this point, we assume that there's data for one of these things
        # that has been parsed, but not yet emitted. And, as such, it implies
        # that we haven't yet reached the end of this 'thing'. So, by setting
        # the mark to 0, we cause any data callbacks that take place in future
        # calls to this function to start from the beginning of that buffer.
        data_callback('header_field', True)
        data_callback('header_value', True)
        data_callback('part_data', True)

        # Save values to locals.
        self.state = state
        self.index = index
        self.flags = flags

        # Return our data length to indicate no errors, and that we processed
        # all of it.
        return length
|
|
|
def finalize(self): |
|
"""Finalize this parser, which signals to that we are finished parsing. |
|
|
|
Note: It does not currently, but in the future, it will verify that we |
|
are in the final state of the parser (i.e. the end of the multipart |
|
message is well-formed), and, if not, throw an error. |
|
""" |
|
# TODO: verify that we're in the state STATE_END, otherwise throw an |
|
# error or otherwise state that we're not finished parsing. |
|
pass |
|
|
|
def __repr__(self): |
|
return f"{self.__class__.__name__}(boundary={self.boundary!r})" |
|
|
|
|
|
class FormParser:
    """This class is the all-in-one form parser. Given all the information
    necessary to parse a form, it will instantiate the correct parser, create
    the proper :class:`Field` and :class:`File` classes to store the data that
    is parsed, and call the two given callbacks with each field and file as
    they become available.

    :param content_type: The Content-Type of the incoming request. This is
                         used to select the appropriate parser.

    :param on_field: The callback to call when a field has been parsed and is
                     ready for usage. See above for parameters.

    :param on_file: The callback to call when a file has been parsed and is
                    ready for usage. See above for parameters.

    :param on_end: An optional callback to call when all fields and files in a
                   request has been parsed. Can be None.

    :param boundary: If the request is a multipart/form-data request, this
                     should be the boundary of the request, as given in the
                     Content-Type header, as a bytestring.

    :param file_name: If the request is of type application/octet-stream, then
                      the body of the request will not contain any information
                      about the uploaded file. In such cases, you can provide
                      the file name of the uploaded file manually.

    :param FileClass: The class to use for uploaded files. Defaults to
                      :class:`File`, but you can provide your own class if you
                      wish to customize behaviour. The class will be
                      instantiated as FileClass(file_name, field_name), and it
                      must provide the following functions::
                          file_instance.write(data)
                          file_instance.finalize()
                          file_instance.close()

    :param FieldClass: The class to use for uploaded fields. Defaults to
                       :class:`Field`, but you can provide your own class if
                       you wish to customize behaviour. The class will be
                       instantiated as FieldClass(field_name), and it must
                       provide the following functions::
                           field_instance.write(data)
                           field_instance.finalize()
                           field_instance.close()

    :param config: Configuration to use for this FormParser. The default
                   values are taken from the DEFAULT_CONFIG value, and then
                   any keys present in this dictionary will overwrite the
                   default values.

    :raises FormParserError: if the Content-Type is unknown, or is
                             multipart/form-data without a boundary.
    """
    #: This is the default configuration for our form parser.
    #: Note: all file sizes should be in bytes.
    DEFAULT_CONFIG = {
        'MAX_BODY_SIZE': float('inf'),
        'MAX_MEMORY_FILE_SIZE': 1 * 1024 * 1024,
        'UPLOAD_DIR': None,
        'UPLOAD_KEEP_FILENAME': False,
        'UPLOAD_KEEP_EXTENSIONS': False,

        # Error on invalid Content-Transfer-Encoding?
        'UPLOAD_ERROR_ON_BAD_CTE': False,
    }

    def __init__(self, content_type, on_field, on_file, on_end=None,
                 boundary=None, file_name=None, FileClass=File,
                 FieldClass=Field, config=None):

        self.logger = logging.getLogger(__name__)

        # Save variables.
        self.content_type = content_type
        self.boundary = boundary
        self.bytes_received = 0
        self.parser = None

        # Save callbacks.
        self.on_field = on_field
        self.on_file = on_file
        self.on_end = on_end

        # Save classes.
        # Bug fix: these previously stored the module-level defaults
        # (`File`/`Field`) unconditionally, silently discarding any custom
        # FileClass/FieldClass passed by the caller.
        self.FileClass = FileClass
        self.FieldClass = FieldClass

        # Set configuration options.  `config` uses the None-sentinel idiom
        # instead of a shared mutable `{}` default.
        self.config = self.DEFAULT_CONFIG.copy()
        self.config.update(config or {})

        # Depending on the Content-Type, we instantiate the correct parser.
        if content_type == 'application/octet-stream':
            # The closures below share mutable state through this namespace
            # (historical workaround for the lack of 'nonlocal' in Py2).
            class vars:
                f = None

            def on_start():
                vars.f = FileClass(file_name, None, config=self.config)

            def on_data(data, start, end):
                vars.f.write(data[start:end])

            def on_end():
                # Finalize the file itself.
                vars.f.finalize()

                # Call our callback.
                on_file(vars.f)

                # Call the on-end callback.
                if self.on_end is not None:
                    self.on_end()

            callbacks = {
                'on_start': on_start,
                'on_data': on_data,
                'on_end': on_end,
            }

            # Instantiate an octet-stream parser
            parser = OctetStreamParser(callbacks,
                                       max_size=self.config['MAX_BODY_SIZE'])

        elif (content_type == 'application/x-www-form-urlencoded' or
                content_type == 'application/x-url-encoded'):

            name_buffer = []

            class vars:
                f = None

            def on_field_start():
                pass

            def on_field_name(data, start, end):
                name_buffer.append(data[start:end])

            def on_field_data(data, start, end):
                if vars.f is None:
                    vars.f = FieldClass(b''.join(name_buffer))
                    del name_buffer[:]
                vars.f.write(data[start:end])

            def on_field_end():
                # Finalize and call callback.
                if vars.f is None:
                    # If we get here, it's because there was no field data.
                    # We create a field, set it to None, and then continue.
                    vars.f = FieldClass(b''.join(name_buffer))
                    del name_buffer[:]
                    vars.f.set_none()

                vars.f.finalize()
                on_field(vars.f)
                vars.f = None

            def on_end():
                if self.on_end is not None:
                    self.on_end()

            # Setup callbacks.
            callbacks = {
                'on_field_start': on_field_start,
                'on_field_name': on_field_name,
                'on_field_data': on_field_data,
                'on_field_end': on_field_end,
                'on_end': on_end,
            }

            # Instantiate parser.
            parser = QuerystringParser(
                callbacks=callbacks,
                max_size=self.config['MAX_BODY_SIZE']
            )

        elif content_type == 'multipart/form-data':
            if boundary is None:
                self.logger.error("No boundary given")
                raise FormParserError("No boundary given")

            header_name = []
            header_value = []
            headers = {}

            class vars:
                f = None
                writer = None
                is_file = False

            def on_part_begin():
                pass

            def on_part_data(data, start, end):
                bytes_processed = vars.writer.write(data[start:end])
                # TODO: check for error here.
                return bytes_processed

            def on_part_end():
                vars.f.finalize()
                if vars.is_file:
                    on_file(vars.f)
                else:
                    on_field(vars.f)

            def on_header_field(data, start, end):
                header_name.append(data[start:end])

            def on_header_value(data, start, end):
                header_value.append(data[start:end])

            def on_header_end():
                headers[b''.join(header_name)] = b''.join(header_value)
                del header_name[:]
                del header_value[:]

            def on_headers_finished():
                # Reset the 'is file' flag.
                vars.is_file = False

                # Parse the content-disposition header.
                # TODO: handle mixed case
                content_disp = headers.get(b'Content-Disposition')
                disp, options = parse_options_header(content_disp)

                # Get the field and filename.
                field_name = options.get(b'name')
                file_name = options.get(b'filename')
                # TODO: check for errors

                # Create the proper class.
                if file_name is None:
                    vars.f = FieldClass(field_name)
                else:
                    vars.f = FileClass(file_name, field_name, config=self.config)
                    vars.is_file = True

                # Parse the given Content-Transfer-Encoding to determine what
                # we need to do with the incoming data.
                # TODO: check that we properly handle 8bit / 7bit encoding.
                transfer_encoding = headers.get(b'Content-Transfer-Encoding',
                                                b'7bit')

                if (transfer_encoding == b'binary' or
                        transfer_encoding == b'8bit' or
                        transfer_encoding == b'7bit'):
                    vars.writer = vars.f

                elif transfer_encoding == b'base64':
                    vars.writer = Base64Decoder(vars.f)

                elif transfer_encoding == b'quoted-printable':
                    vars.writer = QuotedPrintableDecoder(vars.f)

                else:
                    self.logger.warning("Unknown Content-Transfer-Encoding: "
                                        "%r", transfer_encoding)
                    if self.config['UPLOAD_ERROR_ON_BAD_CTE']:
                        raise FormParserError(
                            'Unknown Content-Transfer-Encoding "{}"'.format(
                                transfer_encoding
                            )
                        )
                    else:
                        # If we aren't erroring, then we just treat this as an
                        # unencoded Content-Transfer-Encoding.
                        vars.writer = vars.f

            def on_end():
                vars.writer.finalize()
                if self.on_end is not None:
                    self.on_end()

            # These are our callbacks for the parser.
            callbacks = {
                'on_part_begin': on_part_begin,
                'on_part_data': on_part_data,
                'on_part_end': on_part_end,
                'on_header_field': on_header_field,
                'on_header_value': on_header_value,
                'on_header_end': on_header_end,
                'on_headers_finished': on_headers_finished,
                'on_end': on_end,
            }

            # Instantiate a multipart parser.
            parser = MultipartParser(boundary, callbacks,
                                     max_size=self.config['MAX_BODY_SIZE'])

        else:
            self.logger.warning("Unknown Content-Type: %r", content_type)
            raise FormParserError("Unknown Content-Type: {}".format(
                content_type
            ))

        self.parser = parser

    def write(self, data):
        """Write some data. The parser will forward this to the appropriate
        underlying parser.

        :param data: a bytestring
        """
        self.bytes_received += len(data)
        # TODO: check the parser's return value for errors?
        return self.parser.write(data)

    def finalize(self):
        """Finalize the parser."""
        if self.parser is not None and hasattr(self.parser, 'finalize'):
            self.parser.finalize()

    def close(self):
        """Close the parser."""
        if self.parser is not None and hasattr(self.parser, 'close'):
            self.parser.close()

    def __repr__(self):
        return "{}(content_type={!r}, parser={!r})".format(
            self.__class__.__name__,
            self.content_type,
            self.parser,
        )
|
|
|
|
|
def create_form_parser(headers, on_field, on_file, trust_x_headers=False,
                       config=None):
    """This function is a helper function to aid in creating a FormParser
    instances. Given a dictionary-like headers object, it will determine
    the correct information needed, instantiate a FormParser with the
    appropriate values and given callbacks, and then return the corresponding
    parser.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param trust_x_headers: Whether or not to trust information received from
                            certain X-Headers - for example, the file name from
                            X-File-Name.

    :param config: Configuration variables to pass to the FormParser.

    :raises ValueError: if no Content-Type header is present.
    """
    content_type = headers.get('Content-Type')
    if content_type is None:
        logging.getLogger(__name__).warning("No Content-Type header given")
        raise ValueError("No Content-Type header given!")

    # Boundaries are optional (the FormParser will raise if one is needed
    # but not given).
    content_type, params = parse_options_header(content_type)
    boundary = params.get(b'boundary')

    # We need content_type to be a string, not a bytes object.
    content_type = content_type.decode('latin-1')

    # File names are optional.
    # NOTE(review): X-File-Name is currently honoured regardless of
    # trust_x_headers; gating it on that flag would change behaviour for
    # existing callers, so it is left as-is — confirm intended semantics.
    file_name = headers.get('X-File-Name')

    # Instantiate a form parser.  `config` defaults to None rather than a
    # shared mutable dict; FormParser treats a falsy config as "no overrides".
    form_parser = FormParser(content_type,
                             on_field,
                             on_file,
                             boundary=boundary,
                             file_name=file_name,
                             config=config or {})

    # Return our parser.
    return form_parser
|
|
|
|
|
def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576,
               **kwargs):
    """This function is useful if you just want to parse a request body,
    without too much work. Pass it a dictionary-like object of the request's
    headers, and a file-like object for the input stream, along with two
    callbacks that will get called whenever a field or file is parsed.

    :param headers: A dictionary-like object of HTTP headers. The only
                    required header is Content-Type.

    :param input_stream: A file-like object that represents the request body.
                         The read() method must return bytestrings.

    :param on_field: Callback to call with each parsed field.

    :param on_file: Callback to call with each parsed file.

    :param chunk_size: The maximum size to read from the input stream and write
                       to the parser at one time. Defaults to 1 MiB.

    :param kwargs: Additional keyword arguments (e.g. ``config``,
                   ``trust_x_headers``) forwarded to
                   :func:`create_form_parser`.
    """

    # Create our form parser.  Bug fix: **kwargs was previously accepted but
    # silently dropped instead of being forwarded.
    parser = create_form_parser(headers, on_field, on_file, **kwargs)

    # Read chunks of up to `chunk_size` bytes and write to the parser, but
    # never read more than the given Content-Length, if any.
    content_length = headers.get('Content-Length')
    if content_length is not None:
        content_length = int(content_length)
    else:
        content_length = float('inf')
    bytes_read = 0

    while True:
        # Read only up to the Content-Length given.  Bug fix: this previously
        # hard-coded 1048576 here, ignoring the chunk_size parameter.
        max_readable = min(content_length - bytes_read, chunk_size)
        buff = input_stream.read(max_readable)

        # Write to the parser and update our length.
        parser.write(buff)
        bytes_read += len(buff)

        # If we get a buffer that's smaller than the size requested, or if we
        # have read up to our content length, we're done.
        if len(buff) != max_readable or bytes_read == content_length:
            break

    # Tell our parser that we're done writing data.
    parser.finalize()
|
|
|