# Regular-expression fragments for the HTTP/1.1 grammar (RFC 7230 ABNF).
#
# We use native strings for all the re patterns, to take advantage of string
# formatting, and then convert to bytestrings when compiling the final re
# objects.
#
# Fragments are composed with ``.format(**globals())``: every ``{name}``
# placeholder is filled from a module-level variable that must already be
# defined above the point of use, so definition order matters. In a string
# that is passed through ``.format``, literal regex braces must be doubled
# (``{{1,20}}``); strings that are not formatted keep single braces (``{3}``).

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#whitespace
#  OWS            = *( SP / HTAB )
#                 ; optional whitespace
OWS = r"[ \t]*"

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#rule.token.separators
#   token          = 1*tchar
#
#   tchar          = "!" / "#" / "$" / "%" / "&" / "'" / "*"
#                  / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
#                  / DIGIT / ALPHA
#                  ; any VCHAR, except delimiters
token = r"[-!#$%&'*+.^_`|~0-9a-zA-Z]+"

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#header.fields
#  field-name     = token
field_name = token

# The standard says:
#
#  field-value    = *( field-content / obs-fold )
#  field-content  = field-vchar [ 1*( SP / HTAB ) field-vchar ]
#  field-vchar    = VCHAR / obs-text
#  obs-fold       = CRLF 1*( SP / HTAB )
#                 ; obsolete line folding
#                 ; see Section 3.2.4
#
# https://tools.ietf.org/html/rfc5234#appendix-B.1
#
#   VCHAR          =  %x21-7E
#                  ; visible (printing) characters
#
# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#rule.quoted-string
#   obs-text       = %x80-FF
#
# However, the standard definition of field-content is WRONG! It disallows
# fields containing a single visible character surrounded by whitespace,
# e.g. "foo a bar".
#
# See: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
#
# So our definition of field_content attempts to fix it up...
#
# Also, we allow lots of control characters, because apparently people assume
# that they're legal in practice (e.g., google analytics makes cookies with
# \x01 in them!):
#   https://github.com/python-hyper/h11/issues/57
# We still don't allow NUL or whitespace, because those are often treated as
# meta-characters and letting them through can lead to nasty issues like SSRF.
vchar = r"[\x21-\x7e]"
vchar_or_obs_text = r"[^\x00\s]"
field_vchar = vchar_or_obs_text
# One or more runs of field_vchar, separated by runs of SP / HTAB; the value
# can therefore neither start nor end with whitespace.
field_content = r"{field_vchar}+(?:[ \t]+{field_vchar}+)*".format(**globals())

# We handle obs-fold at a different level, and our fixed-up field_content
# already grows to swallow the whole value, so ? instead of *
field_value = r"({field_content})?".format(**globals())

#  header-field   = field-name ":" OWS field-value OWS
# Named groups: field_name, field_value (surrounding OWS is excluded).
header_field = (
    r"(?P<field_name>{field_name})"
    r":"
    r"{OWS}"
    r"(?P<field_value>{field_value})"
    r"{OWS}".format(**globals())
)

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#request.line
#
#   request-line   = method SP request-target SP HTTP-version CRLF
#   method         = token
#   HTTP-version   = HTTP-name "/" DIGIT "." DIGIT
#   HTTP-name      = %x48.54.54.50 ; "HTTP", case-sensitive
#
# request-target is complicated (see RFC 7230 sec 5.3) -- could be path, full
# URL, host+port (for connect), or even "*", but in any case we are guaranteed
# that it consists of the visible printing characters.
method = token
request_target = r"{vchar}+".format(**globals())
http_version = r"HTTP/(?P<http_version>[0-9]\.[0-9])"
# Named groups: method, target, http_version. The trailing CRLF is not part
# of this pattern.
request_line = (
    r"(?P<method>{method})"
    r" "
    r"(?P<target>{request_target})"
    r" "
    r"{http_version}".format(**globals())
)

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#status.line
#
#   status-line = HTTP-version SP status-code SP reason-phrase CRLF
#   status-code    = 3DIGIT
#   reason-phrase  = *( HTAB / SP / VCHAR / obs-text )
status_code = r"[0-9]{3}"
reason_phrase = r"([ \t]|{vchar_or_obs_text})*".format(**globals())
# Named groups: http_version, status_code, reason (reason is unset when the
# server omitted the reason phrase entirely).
status_line = (
    r"{http_version}"
    r" "
    r"(?P<status_code>{status_code})"
    # However, there are apparently a few too many servers out there that just
    # leave out the reason phrase:
    #   https://github.com/scrapy/scrapy/issues/345#issuecomment-281756036
    #   https://github.com/seanmonstar/httparse/issues/29
    # so make it optional. ?: is a non-capturing group.
    r"(?: (?P<reason>{reason_phrase}))?".format(**globals())
)

HEXDIG = r"[0-9A-Fa-f]"
# Actually
#
#      chunk-size     = 1*HEXDIG
#
# but we impose an upper-limit to avoid ridiculosity. len(str(2**64)) == 20
# (That is the *decimal* length of 2**64; a 64-bit value needs only 16 hex
# digits, so a cap of 20 hex digits is a deliberately generous bound.)
chunk_size = r"({HEXDIG}){{1,20}}".format(**globals())
# Actually
#
#     chunk-ext      = *( ";" chunk-ext-name [ "=" chunk-ext-val ] )
#
# but we aren't parsing the things so we don't really care.
chunk_ext = r";.*"
# Even though the specification does not allow for extra whitespace, we are
# lenient with trailing whitespace before the CRLF because some servers in the
# wild send it.
# Named groups: chunk_size, chunk_ext (chunk_ext is unset when absent).
chunk_header = (
    r"(?P<chunk_size>{chunk_size})"
    r"(?P<chunk_ext>{chunk_ext})?"
    r"{OWS}\r\n".format(**globals())
)