Skip to content

Commit 8b7a136

Browse files
committed
PYTHON-1044 - Fix up unknown BSON type handling
1 parent 0a5ef8d commit 8b7a136

File tree

3 files changed

+146
-50
lines changed

3 files changed

+146
-50
lines changed

bson/__init__.py

Lines changed: 78 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,15 @@
9393
_CODEC_OPTIONS_TYPE_ERROR = TypeError(
9494
"codec_options must be an instance of bson.codec_options.CodecOptions")
9595

96-
def _get_int(data, position, as_class=None,
96+
97+
def _raise_unknown_type(element_type, element_name):
98+
"""Unknown type helper."""
99+
raise InvalidBSON("Detected unknown BSON type %r for fieldname %r. Are "
100+
"you using the latest driver version?" % (
101+
element_type, element_name))
102+
103+
104+
def _get_int(data, position, name, as_class=None,
97105
tz_aware=False, uuid_subtype=OLD_UUID_SUBTYPE,
98106
compile_re=True, unsigned=False):
99107
format = unsigned and "I" or "i"
@@ -137,13 +145,15 @@ def _make_c_string(string, check_null=False):
137145
"UTF-8: %r" % string)
138146

139147

140-
def _get_number(data, position, as_class, tz_aware, uuid_subtype, compile_re):
148+
def _get_number(
149+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
141150
num = struct.unpack("<d", data[position:position + 8])[0]
142151
position += 8
143152
return num, position
144153

145154

146-
def _get_string(data, position, as_class, tz_aware, uuid_subtype, compile_re):
155+
def _get_string(
156+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
147157
length = struct.unpack("<i", data[position:position + 4])[0]
148158
if length <= 0 or (len(data) - position - 4) < length:
149159
raise InvalidBSON("invalid string length")
@@ -153,7 +163,8 @@ def _get_string(data, position, as_class, tz_aware, uuid_subtype, compile_re):
153163
return _get_c_string(data, position, length - 1)
154164

155165

156-
def _get_object(data, position, as_class, tz_aware, uuid_subtype, compile_re):
166+
def _get_object(
167+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
157168
obj_size = struct.unpack("<i", data[position:position + 4])[0]
158169
if data[position + obj_size - 1:position + obj_size] != ZERO:
159170
raise InvalidBSON("bad eoo")
@@ -168,26 +179,43 @@ def _get_object(data, position, as_class, tz_aware, uuid_subtype, compile_re):
168179
return object, position
169180

170181

171-
def _get_array(data, position, as_class, tz_aware, uuid_subtype, compile_re):
172-
obj, position = _get_object(data, position,
173-
as_class, tz_aware, uuid_subtype, compile_re)
182+
def _get_array(
183+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
184+
size = struct.unpack("<i", data[position:position + 4])[0]
185+
end = position + size - 1
186+
if data[end:end + 1] != ZERO:
187+
raise InvalidBSON("bad eoo")
188+
189+
position += 4
190+
end -= 1
174191
result = []
175-
i = 0
176-
while True:
192+
193+
# Avoid doing global and attribute lookups in the loop.
194+
append = result.append
195+
index = data.index
196+
getter = _element_getter
197+
198+
while position < end:
199+
element_type = data[position:position + 1]
200+
# Just skip the keys.
201+
position = index(ZERO, position) + 1
177202
try:
178-
result.append(obj[str(i)])
179-
i += 1
203+
value, position = getter[element_type](
204+
data, position, name,
205+
as_class, tz_aware, uuid_subtype, compile_re)
180206
except KeyError:
181-
break
182-
return result, position
207+
_raise_unknown_type(element_type, name)
208+
append(value)
209+
return result, position + 1
183210

184211

185-
def _get_binary(data, position, as_class, tz_aware, uuid_subtype, compile_re):
186-
length, position = _get_int(data, position)
212+
def _get_binary(
213+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
214+
length, position = _get_int(data, position, name)
187215
subtype = ord(data[position:position + 1])
188216
position += 1
189217
if subtype == 2:
190-
length2, position = _get_int(data, position)
218+
length2, position = _get_int(data, position, name)
191219
if length2 != length - 4:
192220
raise InvalidBSON("invalid binary (st 2) - lengths don't match!")
193221
length = length2
@@ -213,20 +241,22 @@ def _get_binary(data, position, as_class, tz_aware, uuid_subtype, compile_re):
213241
return value, position
214242

215243

216-
def _get_oid(data, position, as_class=None,
244+
def _get_oid(data, position, name, as_class=None,
217245
tz_aware=False, uuid_subtype=OLD_UUID_SUBTYPE, compile_re=True):
218246
value = ObjectId(data[position:position + 12])
219247
position += 12
220248
return value, position
221249

222250

223-
def _get_boolean(data, position, as_class, tz_aware, uuid_subtype, compile_re):
251+
def _get_boolean(
252+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
224253
value = data[position:position + 1] == ONE
225254
position += 1
226255
return value, position
227256

228257

229-
def _get_date(data, position, as_class, tz_aware, uuid_subtype, compile_re):
258+
def _get_date(
259+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
230260
millis = struct.unpack("<q", data[position:position + 8])[0]
231261
diff = millis % 1000
232262
seconds = (millis - diff) / 1000
@@ -238,27 +268,30 @@ def _get_date(data, position, as_class, tz_aware, uuid_subtype, compile_re):
238268
return dt.replace(microsecond=diff * 1000), position
239269

240270

241-
def _get_code(data, position, as_class, tz_aware, uuid_subtype, compile_re):
242-
code, position = _get_string(data, position,
271+
def _get_code(
272+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
273+
code, position = _get_string(data, position, name,
243274
as_class, tz_aware, uuid_subtype, compile_re)
244275
return Code(code), position
245276

246277

247278
def _get_code_w_scope(
248-
data, position, as_class, tz_aware, uuid_subtype, compile_re):
249-
_, position = _get_int(data, position)
250-
code, position = _get_string(data, position,
279+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
280+
_, position = _get_int(data, position, name)
281+
code, position = _get_string(data, position, name,
251282
as_class, tz_aware, uuid_subtype, compile_re)
252-
scope, position = _get_object(data, position,
283+
scope, position = _get_object(data, position, name,
253284
as_class, tz_aware, uuid_subtype, compile_re)
254285
return Code(code, scope), position
255286

256287

257-
def _get_null(data, position, as_class, tz_aware, uuid_subtype, compile_re):
288+
def _get_null(
289+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
258290
return None, position
259291

260292

261-
def _get_regex(data, position, as_class, tz_aware, uuid_subtype, compile_re):
293+
def _get_regex(
294+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
262295
pattern, position = _get_c_string(data, position)
263296
bson_flags, position = _get_c_string(data, position)
264297
bson_re = Regex(pattern, bson_flags)
@@ -268,21 +301,23 @@ def _get_regex(data, position, as_class, tz_aware, uuid_subtype, compile_re):
268301
return bson_re, position
269302

270303

271-
def _get_ref(data, position, as_class, tz_aware, uuid_subtype, compile_re):
272-
collection, position = _get_string(data, position, as_class, tz_aware,
273-
uuid_subtype, compile_re)
274-
oid, position = _get_oid(data, position)
304+
def _get_ref(
305+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
306+
collection, position = _get_string(
307+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re)
308+
oid, position = _get_oid(data, position, name)
275309
return DBRef(collection, oid), position
276310

277311

278312
def _get_timestamp(
279-
data, position, as_class, tz_aware, uuid_subtype, compile_re):
280-
inc, position = _get_int(data, position, unsigned=True)
281-
timestamp, position = _get_int(data, position, unsigned=True)
313+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
314+
inc, position = _get_int(data, position, name, unsigned=True)
315+
timestamp, position = _get_int(data, position, name, unsigned=True)
282316
return Timestamp(timestamp, inc), position
283317

284318

285-
def _get_long(data, position, as_class, tz_aware, uuid_subtype, compile_re):
319+
def _get_long(
320+
data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
286321
# Have to cast to long; on 32-bit unpack may return an int.
287322
# 2to3 will change long to int. That's fine since long doesn't
288323
# exist in python3.
@@ -310,17 +345,21 @@ def _get_long(data, position, as_class, tz_aware, uuid_subtype, compile_re):
310345
BSONINT: _get_int, # number_int
311346
BSONTIM: _get_timestamp,
312347
BSONLON: _get_long, # Same as _get_int after 2to3 runs.
313-
BSONMIN: lambda u, v, w, x, y, z: (MinKey(), v),
314-
BSONMAX: lambda u, v, w, x, y, z: (MaxKey(), v)}
348+
BSONMIN: lambda t, u, v, w, x, y, z: (MinKey(), u),
349+
BSONMAX: lambda t, u, v, w, x, y, z: (MaxKey(), u)}
315350

316351

317352
def _element_to_dict(
318353
data, position, as_class, tz_aware, uuid_subtype, compile_re):
319354
element_type = data[position:position + 1]
320355
position += 1
321356
element_name, position = _get_c_string(data, position)
322-
value, position = _element_getter[element_type](
323-
data, position, as_class, tz_aware, uuid_subtype, compile_re)
357+
try:
358+
func = _element_getter[element_type]
359+
except KeyError:
360+
_raise_unknown_type(element_type, element_name)
361+
value, position = func(data, position, element_name,
362+
as_class, tz_aware, uuid_subtype, compile_re)
324363

325364
return element_name, value, position
326365

bson/_cbsonmodule.c

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1427,10 +1427,10 @@ static PyObject* _cbson_dict_to_bson(PyObject* self, PyObject* args) {
14271427
return result;
14281428
}
14291429

1430-
static PyObject* get_value(PyObject* self, const char* buffer, unsigned* position,
1431-
unsigned char type, unsigned max, PyObject* as_class,
1432-
unsigned char tz_aware, unsigned char uuid_subtype,
1433-
unsigned char compile_re) {
1430+
static PyObject* get_value(PyObject* self, PyObject* name, const char* buffer,
1431+
unsigned* position, unsigned char type, unsigned max,
1432+
PyObject* as_class, unsigned char tz_aware,
1433+
unsigned char uuid_subtype, unsigned char compile_re) {
14341434
struct module_state *state = GETSTATE(self);
14351435

14361436
PyObject* value = NULL;
@@ -1574,7 +1574,7 @@ static PyObject* get_value(PyObject* self, const char* buffer, unsigned* positio
15741574
Py_DECREF(value);
15751575
goto invalid;
15761576
}
1577-
to_append = get_value(self, buffer, position, bson_type,
1577+
to_append = get_value(self, name, buffer, position, bson_type,
15781578
max - (unsigned)key_size,
15791579
as_class, tz_aware, uuid_subtype,
15801580
compile_re);
@@ -2078,11 +2078,50 @@ static PyObject* get_value(PyObject* self, const char* buffer, unsigned* positio
20782078
}
20792079
default:
20802080
{
2081-
PyObject* InvalidDocument = _error("InvalidDocument");
2082-
if (InvalidDocument) {
2083-
PyErr_SetString(InvalidDocument,
2084-
"no c decoder for this type yet");
2085-
Py_DECREF(InvalidDocument);
2081+
PyObject* InvalidBSON = _error("InvalidBSON");
2082+
if (InvalidBSON) {
2083+
#if PY_MAJOR_VERSION >= 3
2084+
PyObject* type_obj = PyBytes_FromFormat("%c", type);
2085+
#else
2086+
PyObject* type_obj = PyString_FromFormat("%c", type);
2087+
#endif
2088+
if (type_obj) {
2089+
PyObject* type_repr = PyObject_Repr(type_obj);
2090+
Py_DECREF(type_obj);
2091+
if (type_repr) {
2092+
PyObject* errmsg = NULL;
2093+
#if PY_MAJOR_VERSION >= 3
2094+
PyObject* left = PyUnicode_FromString(
2095+
"Detected unknown BSON type ");
2096+
if (left) {
2097+
PyObject* lmsg = PyUnicode_Concat(left, type_repr);
2098+
Py_DECREF(left);
2099+
if (lmsg) {
2100+
errmsg = PyUnicode_FromFormat(
2101+
"%U for fieldname '%U'. Are you using the "
2102+
"latest driver version?", lmsg, name);
2103+
Py_DECREF(lmsg);
2104+
}
2105+
}
2106+
#else
2107+
PyObject* name_repr = PyObject_Repr(name);
2108+
if (name_repr) {
2109+
errmsg = PyString_FromFormat(
2110+
"Detected unknown BSON type %s for fieldname %s."
2111+
" Are you using the latest driver version?",
2112+
PyString_AS_STRING(type_repr),
2113+
PyString_AS_STRING(name_repr));
2114+
Py_DECREF(name_repr);
2115+
}
2116+
#endif
2117+
Py_DECREF(type_repr);
2118+
if (errmsg) {
2119+
PyErr_SetObject(InvalidBSON, errmsg);
2120+
Py_DECREF(errmsg);
2121+
}
2122+
}
2123+
}
2124+
Py_DECREF(InvalidBSON);
20862125
}
20872126
goto invalid;
20882127
}
@@ -2173,7 +2212,7 @@ static PyObject* _elements_to_dict(PyObject* self, const char* string,
21732212
return NULL;
21742213
}
21752214
position += (unsigned)name_length + 1;
2176-
value = get_value(self, string, &position, type,
2215+
value = get_value(self, name, string, &position, type,
21772216
max - position, as_class, tz_aware, uuid_subtype,
21782217
compile_re);
21792218
if (!value) {

test/test_bson.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,24 @@ def encode_then_decode_backport_precedence(doc):
358358
qcheck.check_unittest(self, encode_then_decode_backport_precedence,
359359
qcheck.gen_mongo_dict(3))
360360

361+
def test_unknown_type(self):
362+
# Repr value differs with major python version
363+
part = "type %r for fieldname %r" % (b('\x13'), u"foo")
364+
docs = [
365+
b('\x0e\x00\x00\x00\x13foo\x00\x01\x00\x00\x00\x00'),
366+
b('\x16\x00\x00\x00\x04foo\x00\x0c\x00\x00\x00\x130'
367+
'\x00\x01\x00\x00\x00\x00\x00'),
368+
b(' \x00\x00\x00\x04bar\x00\x16\x00\x00\x00\x030\x00\x0e\x00\x00'
369+
'\x00\x13foo\x00\x01\x00\x00\x00\x00\x00\x00')]
370+
for bs in docs:
371+
try:
372+
bson.BSON(bs).decode()
373+
except Exception, exc:
374+
self.assertTrue(isinstance(exc, InvalidBSON))
375+
self.assertTrue(part in str(exc))
376+
else:
377+
self.fail("Failed to raise an exception.")
378+
361379
def test_dbpointer(self):
362380
# *Note* - DBPointer and DBRef are *not* the same thing. DBPointer
363381
# is a deprecated BSON type. DBRef is a convention that does not

0 commit comments

Comments
 (0)