Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ jobs:
${{ runner.os }}-
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
env:
PIE_SOCKET_API_KEY: ${{ secrets.PIE_SOCKET_API_KEY }}
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v1
with:
Expand Down
2 changes: 1 addition & 1 deletion src/HTTP.jl
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,7 @@ end
include("download.jl")
include("Servers.jl") ;using .Servers; using .Servers: listen
include("Handlers.jl") ;using .Handlers; using .Handlers: serve
include("parsemultipart.jl")
include("parsemultipart.jl") ;using .MultiPartParsing: parse_multipart_form
include("WebSockets.jl") ;using .WebSockets

import .ConnectionPool: Transaction, Connection
Expand Down
39 changes: 31 additions & 8 deletions src/IOExtras.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,37 @@ _doc = """

Signal start/end of write or read operations.
"""
"$_doc"
startwrite(io) = nothing
"$_doc"
closewrite(io) = nothing
"$_doc"
startread(io) = nothing
"$_doc"
closeread(io) = nothing
if isdefined(Base, :startwrite)
"$_doc"
Base.startwrite(io) = nothing
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is type piracy now

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we'll need to make a plan around this since Base doesn't define a fallback for closewrite. Either we add closewrite(io) = nothing to Base and we can check if it's defined here, or it's also possible we don't need the fallback anymore. I left this as-is for now since it ensures non-breaking.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why extend the function at all? You can just keep using it as a separate function and qualify it (or the Base one).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because then that's breaking; we have all these code examples all over the package where we tell people to write streaming servers doing closewrite(stream).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can still export it. Downstream users must then decide whether they want to use Base.closewrite or HTTP.closewrite.

Copy link
Member

@DilumAluthge DilumAluthge Sep 2, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, I would strongly prefer that we remove the type piracy and just make a breaking release of HTTP.jl.

(I don't think it's actually breaking. I think the burden is on the downstream users to make sure that they properly qualify any names that they use.

But if this is considered breaking, then I'd rather just tag the breaking release so that we can get rid of the type piracy.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

else
"$_doc"
startwrite(io) = nothing
end

# Fallback no-op: signal the end of a write operation on `io`.
# When Base itself provides `closewrite` (newer Julia versions — confirm exact
# version), extend the Base generic so callers see a single function; on older
# versions define a local one with the same contract.
# NOTE(review): `Base.closewrite(io) = nothing` with an unconstrained argument
# is type piracy — flagged in the PR discussion for a future breaking release.
if isdefined(Base, :closewrite)
    "$_doc"
    Base.closewrite(io) = nothing
else
    "$_doc"
    closewrite(io) = nothing
end

# Fallback no-op: signal the start of a read operation on `io`.
# Extends `Base.startread` when it exists so both spellings dispatch to the
# same generic function; otherwise defines a local fallback.
# NOTE(review): extending the Base function for arbitrary `io` is type piracy
# — same concern as `closewrite` above, per the PR discussion.
if isdefined(Base, :startread)
    "$_doc"
    Base.startread(io) = nothing
else
    "$_doc"
    startread(io) = nothing
end

# Fallback no-op: signal the end of a read operation on `io`.
# Mirrors the `closewrite`/`startread` pattern: extend the Base generic when
# it exists, otherwise define a local fallback.
# NOTE(review): unconstrained extension of the Base function is type piracy —
# flagged in review.
if isdefined(Base, :closeread)
    "$_doc"
    Base.closeread(io) = nothing
else
    "$_doc"
    closeread(io) = nothing
end

using MbedTLS: SSLContext
tcpsocket(io::SSLContext)::TCPSocket = io.bio
Expand Down
205 changes: 92 additions & 113 deletions src/parsemultipart.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,35 @@
module MultiPartParsing

import ..access_threaded, ..Request, ..Multipart, ..payload
using ..Parsers

export parse_multipart_form

# Byte-level constants used while scanning multipart bodies.
const CR_BYTE = 0x0d # \r
const LF_BYTE = 0x0a # \n
const DASH_BYTE = 0x2d # -
const HTAB_BYTE = 0x09 # \t
const SPACE_BYTE = 0x20 # ' '
const RETURN_BYTES = [CR_BYTE, LF_BYTE]
const SEMICOLON_BYTE = UInt8(';')
# \r\n\r\n — the blank line separating part headers from the part body.
const CRLFCRLF = (CR_BYTE, LF_BYTE, CR_BYTE, LF_BYTE)

"compare byte buffer `a` from index `i` to index `j` with `b` and check if they are byte-equal"
function byte_buffers_eq(a, i, j, b)
l = 1
@inbounds for k = i:j
a[k] == b[l] || return false
l += 1
end
return true
end

"""
find_multipart_boundary(bytes, boundaryDelimiter; start::Int=1)

Find the first and last index of the next boundary delimiting a part, and if
the discovered boundary is the terminating boundary.
"""
function find_multipart_boundary(bytes::AbstractVector{UInt8}, boundaryDelimiter::AbstractVector{UInt8}; start::Int=1)
@inline function find_multipart_boundary(bytes::AbstractVector{UInt8}, boundaryDelimiter::AbstractVector{UInt8}; start::Int=1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm kind of amazed that inlining this function has any performance improvement.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, it doesn't. This was from an iteration where I was returning a SubString instead of indices and it could avoid the allocation if it was inlined.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So should it have been removed?

# The boundary delimiter line is prepended with two '-' characters
# The boundary delimiter line starts on a new line, so must be preceded by a \r\n.
# The boundary delimiter line ends with \r\n, and can have "optional linear whitespace" between
Expand All @@ -20,26 +38,26 @@ function find_multipart_boundary(bytes::AbstractVector{UInt8}, boundaryDelimiter
# [RFC2046 5.1.1](https://tools.ietf.org/html/rfc2046#section-5.1.1)

i = start
end_index = i+length(boundaryDelimiter)-1
end_index = i + length(boundaryDelimiter) + 1
while end_index <= length(bytes)
if bytes[i:end_index] == boundaryDelimiter
if bytes[i] == DASH_BYTE && bytes[i + 1] == DASH_BYTE && byte_buffers_eq(bytes, i + 2, end_index, boundaryDelimiter)
# boundary delimiter line start on a new line ...
if i > 1
(i == 2 || bytes[i-2] != CR_BYTE || bytes[i-1] != LF_BYTE) && error("boundary delimiter found, but it was not the start of a line")
# the CRLF preceding the boundary delimiter is "conceptually attached
# to the boundary", so account for this with the index
i-=2
i -= 2
end

# need to check if there are enough characters for the CRLF or for two dashes
end_index < length(bytes)-1 || error("boundary delimiter found, but did not end with new line")

is_terminating_delimiter = bytes[end_index+1] == DASH_BYTE && bytes[end_index+2] == DASH_BYTE
is_terminating_delimiter && (end_index+=2)
is_terminating_delimiter && (end_index += 2)

# ... there can be arbitrary SP and HTAB space between the boundary delimiter ...
while (end_index < length(bytes) && (bytes[end_index+1] in [HTAB_BYTE, SPACE_BYTE]))
end_index+=1
while end_index < length(bytes) && (bytes[end_index+1] in (HTAB_BYTE, SPACE_BYTE))
end_index += 1
end
# ... and ends with a new line
newlineEnd = end_index < length(bytes)-1 &&
Expand Down Expand Up @@ -89,107 +107,45 @@ start index(1) and the end index. Headers are separated from the body by CRLFCRL
[RFC822 3.1](https://tools.ietf.org/html/rfc822#section-3.1)
"""
function find_header_boundary(bytes::AbstractVector{UInt8})
delimiter = UInt8[CR_BYTE, LF_BYTE, CR_BYTE, LF_BYTE]
length(delimiter) > length(bytes) && (return nothing)
length(CRLFCRLF) > length(bytes) && return nothing

l = length(bytes) - length(delimiter) + 1
l = length(bytes) - length(CRLFCRLF) + 1
i = 1
end_index = length(delimiter)
end_index = length(CRLFCRLF)
while (i <= l)
bytes[i:end_index] == delimiter && (return (1, end_index))
byte_buffers_eq(bytes, i, end_index, CRLFCRLF) && return (1, end_index)
i += 1
end_index += 1
end
error("no delimiter found separating header from multipart body")
end

"""
content_disposition_tokenize(str)

Tokenize the "arguments" for the Content-Disposition declaration. A vector of
strings is returned that contains each token and separator found in the source
string. Tokens are separated by either an equal sign(=) or a semi-colon(;) and
may be quoted or escaped with a backslash(\\). All tokens returned are stripped
of whitespace at the beginning and end of the string, quotes are retained.
"""
function content_disposition_tokenize(str)
retval = Vector{SubString}()
start = 1
quotes = false
escaped = false

for offset in eachindex(str)
if escaped == false
if quotes == true && str[offset] == '"'
quotes = false
elseif str[offset] == '\\'
escaped = true
elseif str[offset] == '"'
quotes = true
elseif quotes == false && (str[offset] == ';' || str[offset] == '=')
prev = prevind(str, offset)
if prev > start
push!(retval, strip(SubString(str, start, prev)))
end
push!(retval, SubString(str, offset, offset))
start = nextind(str, offset)
end
else
escaped = false
end
end

if start != lastindex(str)
push!(retval, strip(SubString(str, start)))
end

return retval
# Per-thread cache of the Content-Disposition header regex; sized in `__init__`.
const content_disposition_regex = Parsers.RegexAndMatchData[]

"Build and initialize the regex matching a `Content-Disposition: form-data` header line; capture group 1 holds the parameter list."
function content_disposition_regex_f()
    regex = Parsers.RegexAndMatchData(r"^Content-Disposition:[ \t]*form-data;[ \t]*(.*)\r\n"x)
    return Parsers.init!(regex)
end

"""
content_disposition_extract(str)

Extract all the flags and key/value arguments from the Content-Disposition
line. The result is returned as an array of tuples.

In the case of a flag the first value of the tuple is false the second value
is the flag and the third value is nothing.

In the case of a key/value argument the first value is true, the second is the
key, and the third is the value (or nothing if no value was specified).
"""
function content_disposition_extract(str)
retval = Vector{Tuple{Bool, SubString, Union{SubString,Nothing}}}()
tokens = content_disposition_tokenize(str)
total = length(tokens)

function strip_quotes(val)
if val[1] == '"' && val[end] == '"'
SubString(val, 2, lastindex(val) - 1)
else
val
end
end

i = 1
while i < total
if tokens[i] != ';'
pair = (i + 1 <= total && tokens[i + 1] == "=")
key = strip_quotes(tokens[i])
value = (pair && i + 2 <= total && tokens[i + 2] != ";" ? strip_quotes(tokens[i + 2]) : nothing)
# Per-thread cache of the bare-flag token regex; sized in `__init__`.
const content_disposition_flag_regex = Parsers.RegexAndMatchData[]

"Build and initialize the regex matching a single Content-Disposition flag token (no `=value`), with an optional trailing semicolon."
function content_disposition_flag_regex_f()
    regex = Parsers.RegexAndMatchData(r"""^
    [ \t]*([!#$%&'*+\-.^_`|~[:alnum:]]+);?
    """x)
    return Parsers.init!(regex)
end

push!(retval, (pair, key, value))
# Per-thread cache of the key="value" pair regex; sized in `__init__`.
const content_disposition_pair_regex = Parsers.RegexAndMatchData[]

"Build and initialize the regex matching a Content-Disposition `key=\"value\"` pair; group 1 is the key, group 2 the unquoted value."
function content_disposition_pair_regex_f()
    regex = Parsers.RegexAndMatchData(r"""^
    [ \t]*([!#$%&'*+\-.^_`|~[:alnum:]]+)[ \t]*=[ \t]*"(.*?)";?
    """x)
    return Parsers.init!(regex)
end

if pair
i += 3
else
i += 1
end
else
i += 1
end
end
return retval
# Per-thread cache of the Content-Type header regex; sized in `__init__`.
const content_type_regex = Parsers.RegexAndMatchData[]

"Build and initialize the case-insensitive regex extracting the media type from a Content-Type header; group 1 is the type."
function content_type_regex_f()
    regex = Parsers.RegexAndMatchData(r"(?i)Content-Type: (\S*[^;\s])"x)
    return Parsers.init!(regex)
end

"""
Expand All @@ -199,33 +155,46 @@ Parse a single multi-part chunk into a Multipart object. This will decode
the header and extract the contents from the byte array.
"""
function parse_multipart_chunk(chunk)
(startIndex, end_index) = find_header_boundary(chunk)

headers = String(view(chunk, startIndex:end_index))
startIndex, end_index = find_header_boundary(chunk)
header = SubString(unsafe_string(pointer(chunk, startIndex), end_index - startIndex + 1))
content = view(chunk, end_index+1:lastindex(chunk))

disposition = match(r"(?i)Content-Disposition: form-data(.*)\r\n", headers)

if disposition === nothing
@warn "Content disposition is not specified dropping the chunk." chunk
return # Specifying content disposition is mandatory
# find content disposition
re = access_threaded(content_disposition_regex_f, content_disposition_regex)
if !Parsers.exec(re, header)
@warn "Content disposition is not specified dropping the chunk." String(chunk)
return nothing # Specifying content disposition is mandatory
end
content_disposition = Parsers.group(1, re, header)

re_flag = access_threaded(content_disposition_flag_regex_f, content_disposition_flag_regex)
re_pair = access_threaded(content_disposition_pair_regex_f, content_disposition_pair_regex)
name = nothing
filename = nothing

for (pair, key, value) in content_disposition_extract(disposition[1])
if pair && key == "name"
name = value
elseif pair && key == "filename"
filename = value
while !isempty(content_disposition)
if Parsers.exec(re_pair, content_disposition)
key = Parsers.group(1, re_pair, content_disposition)
value = Parsers.group(2, re_pair, content_disposition)
if key == "name"
name = value
elseif key == "filename"
filename = value
else
# do stuff with other content disposition key-value pairs
end
content_disposition = Parsers.nextbytes(re_pair, content_disposition)
elseif Parsers.exec(re_flag, content_disposition)
# do stuff with content disposition flags
content_disposition = Parsers.nextbytes(re_flag, content_disposition)
else
break
end
end

name === nothing && return

match_contenttype = match(r"(?i)Content-Type: (\S*[^;\s])", headers)
contenttype = match_contenttype !== nothing ? match_contenttype[1] : "text/plain" # if content_type is not specified, the default text/plain is assumed
re_ct = access_threaded(content_type_regex_f, content_type_regex)
contenttype = Parsers.exec(re_ct, header) ? Parsers.group(1, re_ct, header) : "text/plain"

return Multipart(filename, IOBuffer(content), contenttype, "", name)
end
Expand All @@ -238,7 +207,7 @@ chunks which are returned as an array of Multipart objects.
"""
function parse_multipart_body(body::AbstractVector{UInt8}, boundary::AbstractString)::Vector{Multipart}
multiparts = Multipart[]
idxs = find_multipart_boundaries(body, Vector{UInt8}("--$(boundary)"))
idxs = find_multipart_boundaries(body, codeunits(boundary))
length(idxs) > 1 || (return multiparts)

for i in 1:length(idxs)-1
Expand Down Expand Up @@ -275,3 +244,13 @@ function parse_multipart_form(req::Request)::Union{Vector{Multipart}, Nothing}

return parse_multipart_body(payload(req), boundary_delimiter)
end

"""
    __init__()

Module initializer: reset each regex cache and size it to one slot per
thread, so `access_threaded` can lazily build a per-thread
`RegexAndMatchData` without contention.
"""
function __init__()
    slots = Threads.nthreads()
    for cache in (content_disposition_regex,
                  content_disposition_flag_regex,
                  content_disposition_pair_regex,
                  content_type_regex)
        resize!(empty!(cache), slots)
    end
    return
end

end # module MultiPartParsing
9 changes: 5 additions & 4 deletions test/chunking.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
using Test
using HTTP
using HTTP, HTTP.IOExtras
using BufferedStreams

# For more information see: https://github.com/JuliaWeb/HTTP.jl/pull/288
@testset "Chunking" begin
sz = 90
port = 8095
hex(n) = string(n, base=16)
encoded_data = "$(hex(sz + 9))\r\n" * "data: 1$(repeat("x", sz))\n\n" * "\r\n" *
"$(hex(sz + 9))\r\n" * "data: 2$(repeat("x", sz))\n\n" * "\r\n" *
Expand All @@ -15,7 +16,7 @@ using BufferedStreams
split1 = 106
split2 = 300

t = @async HTTP.listen("127.0.0.1", 8091) do http
t = @async HTTP.listen("127.0.0.1", port) do http
startwrite(http)
tcp = http.stream.c.io

Expand All @@ -34,7 +35,7 @@ using BufferedStreams
sleep(1)
@assert !istaskdone(t)

r = HTTP.get("http://127.0.0.1:8091")
r = HTTP.get("http://127.0.0.1:$port")

@test String(r.body) == decoded_data

Expand All @@ -44,7 +45,7 @@ using BufferedStreams
# Ignore byte-by-byte read warning
CL = Base.CoreLogging
CL.with_logger(CL.SimpleLogger(stderr, CL.Error)) do
HTTP.open("GET", "http://127.0.0.1:8091") do io
HTTP.open("GET", "http://127.0.0.1:$port") do io
io = wrap(io)
x = split(decoded_data, "\n")

Expand Down
Loading