
Commit 541da11

Custom Url Parser (#730)

* Custom Url parser for our needs
* lint fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Fix proxy_pool plugin as scheme can be None if not present in the Url
* Address the ambiguous ipv6:port scenario along with valid cases
* lint checks
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* docstring
* Abstract into `http.parser` module
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Fix #398 HTTP/1.0 related issue
* lint checks

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 0739db6 commit 541da11

30 files changed (+414 −167 lines)

examples/https_connect_tunnel.py

Lines changed: 1 addition & 2 deletions

@@ -14,8 +14,7 @@
 
 from proxy import Proxy
 from proxy.common.utils import build_http_response
-from proxy.http.codes import httpStatusCodes
-from proxy.http.parser import httpParserStates
+from proxy.http.parser import httpParserStates, httpStatusCodes
 from proxy.core.base import BaseTcpTunnelHandler
 
 

proxy/common/constants.py

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ def _env_threadless_compliant() -> bool:
 COMMA = b','
 DOT = b'.'
 SLASH = b'/'
+HTTP_1_0 = b'HTTP/1.0'
 HTTP_1_1 = b'HTTP/1.1'
 
 PROXY_AGENT_HEADER_KEY = b'Proxy-agent'

proxy/dashboard/dashboard.py

Lines changed: 1 addition & 2 deletions

@@ -16,9 +16,8 @@
 
 from ..common.utils import build_http_response, bytes_
 from ..http.server import HttpWebServerPlugin, HttpWebServerBasePlugin, httpProtocolTypes
-from ..http.parser import HttpParser
+from ..http.parser import HttpParser, httpStatusCodes
 from ..http.websocket import WebsocketFrame
-from ..http.codes import httpStatusCodes
 
 logger = logging.getLogger(__name__)
 

proxy/http/exception/proxy_auth_failed.py

Lines changed: 1 addition & 2 deletions

@@ -9,8 +9,7 @@
     :license: BSD, see LICENSE for more details.
 """
 from .base import HttpProtocolException
-from ..parser import HttpParser
-from ..codes import httpStatusCodes
+from ..parser import HttpParser, httpStatusCodes
 
 from ...common.constants import PROXY_AGENT_HEADER_VALUE, PROXY_AGENT_HEADER_KEY
 from ...common.utils import build_http_response

proxy/http/exception/proxy_conn_failed.py

Lines changed: 1 addition & 2 deletions

@@ -9,8 +9,7 @@
     :license: BSD, see LICENSE for more details.
 """
 from .base import HttpProtocolException
-from ..parser import HttpParser
-from ..codes import httpStatusCodes
+from ..parser import HttpParser, httpStatusCodes
 
 from ...common.constants import PROXY_AGENT_HEADER_VALUE, PROXY_AGENT_HEADER_KEY
 from ...common.utils import build_http_response

proxy/http/parser/__init__.py

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+"""
+    proxy.py
+    ~~~~~~~~
+    ⚡⚡⚡ Fast, Lightweight, Pluggable, TLS interception capable proxy server focused on
+    Network monitoring, controls & Application development, testing, debugging.
+
+    :copyright: (c) 2013-present by Abhinav Singh and contributors.
+    :license: BSD, see LICENSE for more details.
+"""
+from .parser import HttpParser, httpParserTypes, httpParserStates
+from .chunk import ChunkParser, chunkParserStates
+from .codes import httpStatusCodes
+from .url import Url
+from .methods import httpMethods
+
+__all__ = [
+    'HttpParser',
+    'httpParserTypes',
+    'httpParserStates',
+    'ChunkParser',
+    'chunkParserStates',
+    'httpStatusCodes',
+    'Url',
+    'httpMethods',
+]
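
The commit message calls out an ambiguous `ipv6:port` scenario and notes that `scheme` can be `None`. A minimal sketch of how the re-exported `Url` type is exercised, based only on what this diff shows (`Url.from_bytes` in parser.py below, plus the `scheme` and `remainder` attributes it reads); the full implementation lives in the new `proxy/http/parser/url.py`, which is not part of this excerpt:

from proxy.http.parser import Url

# Fully qualified url: scheme is parsed out of the request line.
url = Url.from_bytes(b'http://localhost:8899/hello?a=1')

# Bare host:port as carried by CONNECT request lines. Per the commit
# message, scheme is None when absent, so callers (e.g. the fixed
# proxy_pool plugin) must not assume it is set.
connect = Url.from_bytes(b'localhost:443')
assert connect.scheme is None  # assumption inferred from the commit message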
proxy/http/parser/chunk.py

Lines changed: 2 additions & 2 deletions

@@ -10,8 +10,8 @@
 """
 from typing import NamedTuple, Tuple, List, Optional
 
-from ..common.utils import bytes_, find_http_line
-from ..common.constants import CRLF, DEFAULT_BUFFER_SIZE
+from ...common.utils import bytes_, find_http_line
+from ...common.constants import CRLF, DEFAULT_BUFFER_SIZE
 
 
 ChunkParserStates = NamedTuple(
proxy/http/parser/codes.py

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@
         ('NETWORK_CONNECT_TIMEOUT_ERROR', int),
     ],
 )
+
 httpStatusCodes = HttpStatusCodes(
     100, 101,
     200,
proxy/http/parser/parser.py

Lines changed: 80 additions & 86 deletions

@@ -8,15 +8,15 @@
     :copyright: (c) 2013-present by Abhinav Singh and contributors.
     :license: BSD, see LICENSE for more details.
 """
-from urllib import parse as urlparse
 from typing import TypeVar, NamedTuple, Optional, Dict, Type, Tuple, List
 
-from .methods import httpMethods
-from .chunk_parser import ChunkParser, chunkParserStates
-
-from ..common.constants import DEFAULT_DISABLE_HEADERS, COLON, SLASH, CRLF, WHITESPACE, HTTP_1_1, DEFAULT_HTTP_PORT
-from ..common.utils import build_http_request, build_http_response, find_http_line, text_
+from ...common.constants import DEFAULT_DISABLE_HEADERS, COLON, HTTP_1_0, SLASH, CRLF
+from ...common.constants import WHITESPACE, HTTP_1_1, DEFAULT_HTTP_PORT
+from ...common.utils import build_http_request, build_http_response, find_http_line, text_
 
+from .url import Url
+from .methods import httpMethods
+from .chunk import ChunkParser, chunkParserStates
 
 HttpParserStates = NamedTuple(
     'HttpParserStates', [
@@ -58,44 +58,26 @@ class HttpParser:
     def __init__(self, parser_type: int) -> None:
         self.type: int = parser_type
         self.state: int = httpParserStates.INITIALIZED
-
         self.host: Optional[bytes] = None
         self.port: Optional[int] = None
         self.path: Optional[bytes] = None
         self.method: Optional[bytes] = None
         self.code: Optional[bytes] = None
         self.reason: Optional[bytes] = None
         self.version: Optional[bytes] = None
-
         # Total size of raw bytes passed for parsing
         self.total_size: int = 0
-
         # Buffer to hold unprocessed bytes
         self.buffer: bytes = b''
-
-        # Keys are lower case header names
-        # Values are 2-tuple containing original
-        # header and it's value as received.
+        # Internal headers datastructure:
+        # - Keys are lower case header names.
+        # - Values are 2-tuple containing original
+        #   header and it's value as received.
         self.headers: Dict[bytes, Tuple[bytes, bytes]] = {}
         self.body: Optional[bytes] = None
-
-        self.chunk_parser: Optional[ChunkParser] = None
-
-        # TODO: Deprecate me, we don't need this in core.
-        #
-        # Deprecated since v2.4.0
-        #
-        # This is mostly for developers so that they can directly
-        # utilize a url object, but is unnecessary as parser
-        # provides all the necessary parsed information.
-        #
-        # But developers can utilize urlsplit or whatever
-        # library they are using when necessary. This will certainly
-        # give some performance boost as url parsing won't be needed
-        # for every request/response object.
-        #
-        # (except query string and fragments)
-        self._url: Optional[urlparse.SplitResultBytes] = None
+        self.chunk: Optional[ChunkParser] = None
+        # Internal request line as a url structure
+        self._url: Optional[Url] = None
 
     @classmethod
     def request(cls: Type[T], raw: bytes) -> T:
@@ -110,50 +92,74 @@ def response(cls: Type[T], raw: bytes) -> T:
         return parser
 
     def header(self, key: bytes) -> bytes:
+        """Convenient method to return original header value from internal datastructure."""
         if key.lower() not in self.headers:
             raise KeyError('%s not found in headers', text_(key))
         return self.headers[key.lower()][1]
 
     def has_header(self, key: bytes) -> bool:
+        """Returns true if header key was found in payload."""
         return key.lower() in self.headers
 
     def add_header(self, key: bytes, value: bytes) -> None:
+        """Add/Update a header to internal data structure."""
         self.headers[key.lower()] = (key, value)
 
     def add_headers(self, headers: List[Tuple[bytes, bytes]]) -> None:
+        """Add/Update multiple headers to internal data structure"""
         for (key, value) in headers:
             self.add_header(key, value)
 
     def del_header(self, header: bytes) -> None:
+        """Delete a header from internal data structure."""
         if header.lower() in self.headers:
             del self.headers[header.lower()]
 
     def del_headers(self, headers: List[bytes]) -> None:
+        """Delete headers from internal datastructure."""
         for key in headers:
             self.del_header(key.lower())
 
     def set_url(self, url: bytes) -> None:
-        # Work around with urlsplit semantics.
-        #
-        # For CONNECT requests, request line contains
-        # upstream_host:upstream_port which is not complaint
-        # with urlsplit, which expects a fully qualified url.
-        if self.is_https_tunnel():
-            url = b'https://' + url
-        self._url = urlparse.urlsplit(url)
+        """Given a request line, parses it and sets line attributes a.k.a. host, port, path."""
+        self._url = Url.from_bytes(url)
         self._set_line_attributes()
 
+    def has_host(self) -> bool:
+        """Returns whether host line attribute was parsed or set.
+
+        NOTE: Host field WILL be None for incoming local WebServer requests."""
+        return self.host is not None
+
+    def is_http_1_1_keep_alive(self) -> bool:
+        """Returns true for HTTP/1.1 keep-alive connections."""
+        return self.version == HTTP_1_1 and \
+            (
+                not self.has_header(b'Connection') or
+                self.header(b'Connection').lower() == b'keep-alive'
+            )
+
+    def is_connection_upgrade(self) -> bool:
+        """Returns true for websocket upgrade requests."""
+        return self.version == HTTP_1_1 and \
+            self.has_header(b'Connection') and \
+            self.has_header(b'Upgrade')
+
     def is_https_tunnel(self) -> bool:
+        """Returns true for HTTPS CONNECT tunnel request."""
         return self.method == httpMethods.CONNECT
 
     def is_chunked_encoded(self) -> bool:
+        """Returns true if transfer-encoding chunked is used."""
         return b'transfer-encoding' in self.headers and \
             self.headers[b'transfer-encoding'][1].lower() == b'chunked'
 
     def content_expected(self) -> bool:
+        """Returns true if content-length is present and not 0."""
         return b'content-length' in self.headers and int(self.header(b'content-length')) > 0
 
     def body_expected(self) -> bool:
+        """Returns true if content or chunked response is expected."""
         return self.content_expected() or self.is_chunked_encoded()
 
     def parse(self, raw: bytes) -> None:
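
The relocated predicates above are self-contained, so their behavior follows directly from this hunk; a short usage sketch with a hypothetical request:

from proxy.http.parser import HttpParser

# HTTP/1.1 with no explicit Connection header defaults to keep-alive.
request = HttpParser.request(b'GET / HTTP/1.1\r\nHost: localhost\r\n\r\n')
assert request.is_http_1_1_keep_alive()
assert not request.is_connection_upgrade()
assert not request.is_https_tunnel()
assert not request.body_expected()  # no content-length, not chunked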
@@ -173,20 +179,20 @@ def parse(self, raw: bytes) -> None:
 
     def build(self, disable_headers: Optional[List[bytes]] = None, for_proxy: bool = False) -> bytes:
         """Rebuild the request object."""
-        assert self.method and self.version and self.path and self.type == httpParserTypes.REQUEST_PARSER
+        assert self.method and self.version and self.type == httpParserTypes.REQUEST_PARSER
         if disable_headers is None:
             disable_headers = DEFAULT_DISABLE_HEADERS
         body: Optional[bytes] = self._get_body_or_chunks()
-        path = self.path
+        path = self.path or b'/'
         if for_proxy:
-            assert self._url and self.host and self.port and self.path
+            assert self.host and self.port and self._url
             path = (
-                self._url.scheme +
+                b'http' if not self._url.scheme else self._url.scheme +
                 COLON + SLASH + SLASH +
                 self.host +
                 COLON +
                 str(self.port).encode() +
-                self.path
+                path
             ) if not self.is_https_tunnel() else (self.host + COLON + str(self.port).encode())
         return build_http_request(
             self.method, path, self.version,
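
A hedged sketch of the `for_proxy` behavior above, using the unambiguous https-tunnel branch (hypothetical upstream host and port):

from proxy.http.parser import HttpParser

# For CONNECT tunnels, for_proxy=True rebuilds the request line as host:port.
connect = HttpParser.request(b'CONNECT httpbin.org:443 HTTP/1.1\r\n\r\n')
assert connect.build(for_proxy=True).startswith(b'CONNECT httpbin.org:443 HTTP/1.1')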
@@ -210,24 +216,26 @@ def build_response(self) -> bytes:
             body=self._get_body_or_chunks(),
         )
 
-    def has_host(self) -> bool:
-        """Host field SHOULD be None for incoming local WebServer requests."""
-        return self.host is not None
-
-    def is_http_1_1_keep_alive(self) -> bool:
-        return self.version == HTTP_1_1 and \
-            (
-                not self.has_header(b'Connection') or
-                self.header(b'Connection').lower() == b'keep-alive'
-            )
-
-    def is_connection_upgrade(self) -> bool:
-        return self.version == HTTP_1_1 and \
-            self.has_header(b'Connection') and \
-            self.has_header(b'Upgrade')
-
     def _process_body(self, raw: bytes) -> Tuple[bool, bytes]:
-        if b'content-length' in self.headers:
+        # Ref: http://www.ietf.org/rfc/rfc2616.txt
+        # 3.If a Content-Length header field (section 14.13) is present, its
+        #   decimal value in OCTETs represents both the entity-length and the
+        #   transfer-length. The Content-Length header field MUST NOT be sent
+        #   if these two lengths are different (i.e., if a Transfer-Encoding
+        #   header field is present). If a message is received with both a
+        #   Transfer-Encoding header field and a Content-Length header field,
+        #   the latter MUST be ignored.
+        #
+        # TL;DR -- Give transfer-encoding header preference over content-length.
+        if self.is_chunked_encoded():
+            if not self.chunk:
+                self.chunk = ChunkParser()
+            raw = self.chunk.parse(raw)
+            if self.chunk.state == chunkParserStates.COMPLETE:
+                self.body = self.chunk.body
+                self.state = httpParserStates.COMPLETE
+            more = False
+        elif b'content-length' in self.headers:
             self.state = httpParserStates.RCVING_BODY
             if self.body is None:
                 self.body = b''
@@ -238,19 +246,17 @@ def _process_body(self, raw: bytes) -> Tuple[bool, bytes]:
                     len(self.body) == int(self.header(b'content-length')):
                 self.state = httpParserStates.COMPLETE
             more, raw = len(raw) > 0, raw[total_size - received_size:]
-        elif self.is_chunked_encoded():
-            if not self.chunk_parser:
-                self.chunk_parser = ChunkParser()
-            raw = self.chunk_parser.parse(raw)
-            if self.chunk_parser.state == chunkParserStates.COMPLETE:
-                self.body = self.chunk_parser.body
-                self.state = httpParserStates.COMPLETE
-            more = False
         else:
-            raise NotImplementedError(
-                'Parser shouldn\'t have reached here. ' +
-                'This can happen when content length header is missing but their is a body in the payload',
-            )
+            # HTTP/1.0 scenario only
+            assert self.version == HTTP_1_0
+            self.state = httpParserStates.RCVING_BODY
+            # Received a packet without content-length header
+            # and no transfer-encoding specified.
+            #
+            # Ref https://github.com/abhinavsingh/proxy.py/issues/398
+            # See TestHttpParser.test_issue_398 scenario
+            self.body = raw
+            more, raw = False, b''
         return more, raw
 
     def _process_line_and_headers(self, raw: bytes) -> Tuple[bool, bytes]:
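
The reordered `_process_body` branches can be exercised end to end. A sketch inferred from this hunk and the referenced `TestHttpParser.test_issue_398`; the exact terminal states are an assumption:

from proxy.http.parser import HttpParser, httpParserStates

# Transfer-Encoding takes precedence over Content-Length, per the
# RFC 2616 note above; 0xb == 11 == len(b'hello world').
chunked = HttpParser.response(
    b'HTTP/1.1 200 OK\r\n'
    b'Transfer-Encoding: chunked\r\n'
    b'Content-Length: 5\r\n\r\n'
    b'b\r\nhello world\r\n0\r\n\r\n',
)
assert chunked.body == b'hello world'

# HTTP/1.0 without either header: trailing bytes become body and the
# parser keeps receiving until the peer closes (issue #398).
legacy = HttpParser.response(b'HTTP/1.0 200 OK\r\n\r\nhello world')
assert legacy.body == b'hello world'
assert legacy.state == httpParserStates.RCVING_BODY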
@@ -319,16 +325,4 @@ def _set_line_attributes(self) -> None:
                 'Invalid request. Method: %r, Url: %r' %
                 (self.method, self._url),
             )
-        self.path = self._build_path()
-
-    def _build_path(self) -> bytes:
-        if not self._url:
-            return b'/None'
-        url = self._url.path
-        if url == b'':
-            url = b'/'
-        if not self._url.query == b'':
-            url += b'?' + self._url.query
-        if not self._url.fragment == b'':
-            url += b'#' + self._url.fragment
-        return url
+        self.path = self._url.remainder
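
With `_build_path` deleted, `path` now comes straight from `Url.remainder`. A quick sketch of the intended equivalence, assuming `remainder` covers path plus query string (and fragment) exactly as the removed helper reassembled them from urlsplit parts:

from proxy.http.parser import HttpParser

request = HttpParser.request(b'GET http://localhost:8899/hello?a=1 HTTP/1.1\r\n\r\n')
assert request.path == b'/hello?a=1'  # remainder of the url after host:port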
