From dcce27b394de59898480c9ed53cd890f995a93be Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Sun, 21 Jan 2018 22:03:24 +0000 Subject: [PATCH 01/10] Initial commit of bulk reading into numpy arrays --- fiona/_vectorized.pyx | 116 +++++++++++++++++++++++++++++++++++++++ fiona/ogrext.pxd | 8 +++ fiona/ogrext.pyx | 7 --- fiona/ogrext1.pxd | 3 +- fiona/ogrext2.pxd | 3 +- requirements.txt | 1 + setup.py | 20 ++++++- tests/test_vectorized.py | 46 ++++++++++++++++ 8 files changed, 193 insertions(+), 11 deletions(-) create mode 100644 fiona/_vectorized.pyx create mode 100644 fiona/ogrext.pxd create mode 100644 tests/test_vectorized.py diff --git a/fiona/_vectorized.pyx b/fiona/_vectorized.pyx new file mode 100644 index 000000000..e0032964d --- /dev/null +++ b/fiona/_vectorized.pyx @@ -0,0 +1,116 @@ +from .ogrext cimport Session, _deleteOgrFeature +from .ogrext import FIELD_TYPES, FIELD_TYPES_MAP, OGRERR_NONE +from ._shim cimport * + +import logging +from six import text_type + +import numpy as np +cimport numpy as np + +log = logging.getLogger(__name__) + +def read_vectorized(collection): + cdef Session session + cdef void * cogr_feature + cdef void * cogr_geometry + cdef int num_fields + cdef void * fdefn + cdef int feature_index + cdef int field_index + cdef char * field_name_c + cdef bytes field_name_bytes + cdef str field_name + cdef int i + cdef long long [:] arr_int + cdef double [:] arr_double + cdef char * wkt + + session = collection.session + encoding = session._fileencoding + + if session.cogr_layer == NULL: + raise ValueError("Null layer") + + length = OGR_L_GetFeatureCount(session.cogr_layer, 0) + + data_geometry = np.empty([length], dtype=object) + data_properties = {} + + schema = session.get_schema() + for field_name, field_type in schema["properties"].items(): + field_type, precision = field_type.split(":") + if field_type == "int": + data_properties[field_name] = np.empty([length], dtype=np.int64) + elif field_type == "float": + data_properties[field_name] = np.empty([length], dtype=np.float64) + elif field_type == "str": + data_properties[field_name] = np.empty([length], dtype=object) + else: + # TODO: other types (dates, bytes, boolean subtype) + raise TypeError("Unexpected field type: {}".format(field_type)) + + for feature_index in range(length): + # TODO: this isn't the correct way to iterate over features + cogr_feature = OGR_L_GetFeature(session.cogr_layer, feature_index) + + num_fields = OGR_F_GetFieldCount(cogr_feature) + for field_index in range(num_fields): + fdefn = OGR_F_GetFieldDefnRef(cogr_feature, field_index) + + # field name + field_name_c = OGR_Fld_GetNameRef(fdefn) + field_name_bytes = field_name_c + field_name = field_name_bytes.decode(encoding) + + # field type + field_type_id = OGR_Fld_GetType(fdefn) + field_type_name = FIELD_TYPES[field_type_id] + field_type = FIELD_TYPES_MAP[field_type_name] + + if field_type is int: + arr_int = data_properties[field_name] + if is_field_null(cogr_feature, field_index): + # TODO: this isn't the correct way to handle NULL for ints + arr_int[feature_index] = 0 + else: + arr_int[feature_index] = OGR_F_GetFieldAsInteger64(cogr_feature, field_index) + elif field_type is float: + arr_double = data_properties[field_name] + if is_field_null(cogr_feature, field_index): + arr_double[feature_index] = np.nan + else: + arr_double[feature_index] = OGR_F_GetFieldAsDouble(cogr_feature, field_index) + elif field_type is text_type: + if is_field_null(cogr_feature, field_index): + value = None + else: + try: + value = OGR_F_GetFieldAsString(cogr_feature, field_index) + value = value.decode(encoding) + except UnicodeDecodeError: + log.warning( + "Failed to decode %s using %s codec", value, encoding) + arr = data_properties[field_name] + arr[feature_index] = value + else: + raise TypeError("Unexpected field type: {}".format(field_type)) + + # TODO: best way to return geometries for shapely? + cogr_geometry = OGR_F_GetGeometryRef(cogr_feature) + if cogr_geometry == NULL: + data_geometry[feature_index] = None + else: + result = OGR_G_ExportToWkt(cogr_geometry, &wkt) + if result != OGRERR_NONE: + raise ValueError("Failed to export geometry to WKT") + data_geometry[feature_index] = wkt + + _deleteOgrFeature(cogr_feature) + + features = { + "geometry": data_geometry, + "properties": data_properties, + } + + return features diff --git a/fiona/ogrext.pxd b/fiona/ogrext.pxd new file mode 100644 index 000000000..66adb5587 --- /dev/null +++ b/fiona/ogrext.pxd @@ -0,0 +1,8 @@ +cdef class Session: + cdef void *cogr_ds + cdef void *cogr_layer + cdef object _fileencoding + cdef object _encoding + cdef object collection + +cdef _deleteOgrFeature(void *cogr_feature) diff --git a/fiona/ogrext.pyx b/fiona/ogrext.pyx index 4e3dcd666..029e18482 100644 --- a/fiona/ogrext.pyx +++ b/fiona/ogrext.pyx @@ -397,13 +397,6 @@ def featureRT(feature, collection): # Collection-related extension classes and functions cdef class Session: - - cdef void *cogr_ds - cdef void *cogr_layer - cdef object _fileencoding - cdef object _encoding - cdef object collection - def __init__(self): self.cogr_ds = NULL self.cogr_layer = NULL diff --git a/fiona/ogrext1.pxd b/fiona/ogrext1.pxd index 305ed505d..fed22f236 100644 --- a/fiona/ogrext1.pxd +++ b/fiona/ogrext1.pxd @@ -135,7 +135,8 @@ cdef extern from "ogr_api.h": void * OGR_G_CreateGeometry (int wkbtypecode) void OGR_G_DestroyGeometry (void *geometry) unsigned char * OGR_G_ExportToJson (void *geometry) - void OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer) + OGRErr OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer) + OGRErr OGR_G_ExportToWkt (void *geometry, char **wkt) int OGR_G_GetCoordinateDimension (void *geometry) int OGR_G_GetGeometryCount (void *geometry) unsigned char * OGR_G_GetGeometryName (void *geometry) diff --git a/fiona/ogrext2.pxd b/fiona/ogrext2.pxd index 9eecd832a..544375375 100644 --- a/fiona/ogrext2.pxd +++ b/fiona/ogrext2.pxd @@ -196,7 +196,8 @@ cdef extern from "ogr_api.h": void * OGR_G_CreateGeometry (int wkbtypecode) void OGR_G_DestroyGeometry (void *geometry) unsigned char * OGR_G_ExportToJson (void *geometry) - void OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer) + OGRErr OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer) + OGRErr OGR_G_ExportToWkt (void *geometry, char **wkt) int OGR_G_GetCoordinateDimension (void *geometry) int OGR_G_GetGeometryCount (void *geometry) unsigned char * OGR_G_GetGeometryName (void *geometry) diff --git a/requirements.txt b/requirements.txt index 563bac9ce..f42d07625 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ cligj>=0.4 six>=1.7 ordereddict munch +numpy diff --git a/setup.py b/setup.py index 44ce8a513..b87116c7e 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,13 @@ from setuptools import setup from setuptools.extension import Extension +# NumPy is required for vectorized submodule +try: + import numpy as np +except ImportError: + has_numpy = False +else: + has_numpy = True # Use Cython if available. try: @@ -180,6 +187,9 @@ def run(self): libraries=libraries, extra_link_args=extra_link_args) +if has_numpy: + ext_options["include_dirs"] = [np.get_include()] + ext_options_cpp = ext_options.copy() # GDAL 2.3+ requires C++11 if sys.platform == "win32": @@ -212,14 +222,17 @@ def run(self): shutil.copy('fiona/_shim2.pyx', 'fiona/_shim.pyx') shutil.copy('fiona/_shim2.pxd', 'fiona/_shim.pxd') - ext_modules = cythonize([ + ext_modules = [ Extension('fiona._geometry', ['fiona/_geometry.pyx'], **ext_options), Extension('fiona._transform', ['fiona/_transform.pyx'], **ext_options_cpp), Extension('fiona._crs', ['fiona/_crs.pyx'], **ext_options), Extension('fiona._drivers', ['fiona/_drivers.pyx'], **ext_options), Extension('fiona._err', ['fiona/_err.pyx'], **ext_options), Extension('fiona._shim', ['fiona/_shim.pyx'], **ext_options), - Extension('fiona.ogrext', ['fiona/ogrext.pyx'], **ext_options)]) + Extension('fiona.ogrext', ['fiona/ogrext.pyx'], **ext_options)] + if has_numpy: + ext_modules.append(Extension('fiona._vectorized', ['fiona/_vectorized.pyx'], **ext_options)) + ext_modules = cythonize(ext_modules) # If there's no manifest template, as in an sdist, we just specify .c files. elif "clean" not in sys.argv: @@ -231,6 +244,9 @@ def run(self): Extension('fiona._err', ['fiona/_err.c'], **ext_options), Extension('fiona.ogrext', ['fiona/ogrext.c'], **ext_options), ] + if has_numpy: + ext_modules.append(Extension('fiona._vectorized', ['fiona/_vectorized.c'], **ext_options)) + ext_modules = cythonize(ext_modules) if gdal_major_version == 1: log.info("Building Fiona for gdal 1.x: {0}".format(gdalversion)) diff --git a/tests/test_vectorized.py b/tests/test_vectorized.py new file mode 100644 index 000000000..a534baa63 --- /dev/null +++ b/tests/test_vectorized.py @@ -0,0 +1,46 @@ +import pytest +import fiona +from six import integer_types, string_types +try: + from fiona._vectorized import read_vectorized + has_vectorized = True +except ImportError: + has_vectorized = False + +if has_vectorized: + import numpy as np + from numpy.testing import assert_allclose + +requires_vectorized = pytest.mark.skipif(not has_vectorized, reason="Vectorized submodule not available") + +@requires_vectorized +def test_read_vectorized(path_coutwildrnp_shp): + with fiona.open(path_coutwildrnp_shp, "r") as collection: + features = read_vectorized(collection) + + assert len(features["geometry"]) == 67 + assert features["geometry"].dtype == object + assert features["geometry"][0].decode("ascii").startswith("POLYGON (") + assert features["geometry"][-1].decode("ascii").startswith("POLYGON (") + # TODO: better checks for geometry + + # check number of properties + assert len(features["properties"]) == len(collection.schema["properties"]) + + # float + assert features["properties"]["PERIMETER"].dtype == np.float64 + assert features["properties"]["PERIMETER"].shape == (67,) + assert_allclose(features["properties"]["PERIMETER"][0], 1.22107) + assert_allclose(features["properties"]["PERIMETER"][-1], 0.120627) + + # integer + assert features["properties"]["WILDRNP020"].dtype == np.int64 + assert features["properties"]["WILDRNP020"].shape == (67,) + assert features["properties"]["WILDRNP020"][0] == 332 + assert features["properties"]["WILDRNP020"][-1] == 511 + + # string + assert isinstance(features["properties"]["NAME"].dtype, object) + assert features["properties"]["NAME"].shape == (67,) + assert features["properties"]["NAME"][0] == "Mount Naomi Wilderness" + assert features["properties"]["NAME"][-1] == "Mesa Verde Wilderness" From b7aec85ddde8202dd3f9f46cb2078d622359f8b7 Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Sun, 21 Jan 2018 22:54:23 +0000 Subject: [PATCH 02/10] Removed incorrect call to cythonize() --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index b87116c7e..0451368de 100644 --- a/setup.py +++ b/setup.py @@ -246,7 +246,6 @@ def run(self): ] if has_numpy: ext_modules.append(Extension('fiona._vectorized', ['fiona/_vectorized.c'], **ext_options)) - ext_modules = cythonize(ext_modules) if gdal_major_version == 1: log.info("Building Fiona for gdal 1.x: {0}".format(gdalversion)) From 4742cad034e6a5f171e0e9c1c8c738d6daca2029 Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Mon, 22 Jan 2018 12:50:10 +0000 Subject: [PATCH 03/10] Append NumPy include dir, not overwrite --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0451368de..274005a36 100644 --- a/setup.py +++ b/setup.py @@ -188,7 +188,7 @@ def run(self): extra_link_args=extra_link_args) if has_numpy: - ext_options["include_dirs"] = [np.get_include()] + ext_options["include_dirs"].append(np.get_include()) ext_options_cpp = ext_options.copy() # GDAL 2.3+ requires C++11 From 395c0e5b30a87a0851bd9160865eed1df35470c8 Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Mon, 22 Jan 2018 13:37:30 +0000 Subject: [PATCH 04/10] Removed type from field name --- fiona/_vectorized.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/fiona/_vectorized.pyx b/fiona/_vectorized.pyx index e0032964d..fc271d23f 100644 --- a/fiona/_vectorized.pyx +++ b/fiona/_vectorized.pyx @@ -20,7 +20,6 @@ def read_vectorized(collection): cdef int field_index cdef char * field_name_c cdef bytes field_name_bytes - cdef str field_name cdef int i cdef long long [:] arr_int cdef double [:] arr_double From f8fecfcdf88c30912a92248d9cfe0cc890c2221f Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Sat, 24 Mar 2018 13:47:44 +0000 Subject: [PATCH 05/10] Support ignore fields/geometry in read_vectorized --- fiona/_vectorized.pyx | 35 +++++++++++++++++++++++++---------- tests/test_vectorized.py | 24 +++++++++++++++++------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/fiona/_vectorized.pyx b/fiona/_vectorized.pyx index fc271d23f..c0757bb59 100644 --- a/fiona/_vectorized.pyx +++ b/fiona/_vectorized.pyx @@ -33,11 +33,24 @@ def read_vectorized(collection): length = OGR_L_GetFeatureCount(session.cogr_layer, 0) - data_geometry = np.empty([length], dtype=object) data_properties = {} + if collection.ignore_fields: + ignore_fields = set(collection.ignore_fields) + else: + ignore_fields = set() + + if collection.ignore_geometry: + ignore_geometry = True + data_geometry = None + else: + ignore_geometry = False + data_geometry = np.empty([length], dtype=object) + schema = session.get_schema() for field_name, field_type in schema["properties"].items(): + if field_name in ignore_fields: + continue field_type, precision = field_type.split(":") if field_type == "int": data_properties[field_name] = np.empty([length], dtype=np.int64) @@ -61,6 +74,8 @@ def read_vectorized(collection): field_name_c = OGR_Fld_GetNameRef(fdefn) field_name_bytes = field_name_c field_name = field_name_bytes.decode(encoding) + if field_name in ignore_fields: + continue # field type field_type_id = OGR_Fld_GetType(fdefn) @@ -95,15 +110,15 @@ def read_vectorized(collection): else: raise TypeError("Unexpected field type: {}".format(field_type)) - # TODO: best way to return geometries for shapely? - cogr_geometry = OGR_F_GetGeometryRef(cogr_feature) - if cogr_geometry == NULL: - data_geometry[feature_index] = None - else: - result = OGR_G_ExportToWkt(cogr_geometry, &wkt) - if result != OGRERR_NONE: - raise ValueError("Failed to export geometry to WKT") - data_geometry[feature_index] = wkt + if not ignore_geometry: + cogr_geometry = OGR_F_GetGeometryRef(cogr_feature) + if cogr_geometry == NULL: + data_geometry[feature_index] = None + else: + result = OGR_G_ExportToWkt(cogr_geometry, &wkt) + if result != OGRERR_NONE: + raise ValueError("Failed to export geometry to WKT") + data_geometry[feature_index] = wkt _deleteOgrFeature(cogr_feature) diff --git a/tests/test_vectorized.py b/tests/test_vectorized.py index a534baa63..5d62e9089 100644 --- a/tests/test_vectorized.py +++ b/tests/test_vectorized.py @@ -3,17 +3,12 @@ from six import integer_types, string_types try: from fiona._vectorized import read_vectorized - has_vectorized = True except ImportError: - has_vectorized = False - -if has_vectorized: + pytestmark = pytest.mark.skip +else: import numpy as np from numpy.testing import assert_allclose -requires_vectorized = pytest.mark.skipif(not has_vectorized, reason="Vectorized submodule not available") - -@requires_vectorized def test_read_vectorized(path_coutwildrnp_shp): with fiona.open(path_coutwildrnp_shp, "r") as collection: features = read_vectorized(collection) @@ -44,3 +39,18 @@ def test_read_vectorized(path_coutwildrnp_shp): assert features["properties"]["NAME"].shape == (67,) assert features["properties"]["NAME"][0] == "Mount Naomi Wilderness" assert features["properties"]["NAME"][-1] == "Mesa Verde Wilderness" + +def test_ignore_fields(path_coutwildrnp_shp): + with fiona.open(path_coutwildrnp_shp, ignore_fields=["NAME"]) as collection: + features = read_vectorized(collection) + + assert "PERIMETER" in features["properties"] + assert "WILDRNP020" in features["properties"] + assert "NAME" not in features["properties"] + + assert features["geometry"] is not None + +def test_ignore_geometry(path_coutwildrnp_shp): + with fiona.open(path_coutwildrnp_shp, ignore_geometry=True) as collection: + features = read_vectorized(collection) + assert features["geometry"] is None From 6784292af0d41e2d99ba2bfd9cdfb8868d21ffeb Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Sat, 24 Mar 2018 13:48:36 +0000 Subject: [PATCH 06/10] Ignore fiona/_vectorized.c --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b16558592..f6847a21e 100644 --- a/.gitignore +++ b/.gitignore @@ -76,6 +76,7 @@ fiona/_shim2.c fiona/_shim22.c fiona/_shim.pxd fiona/_shim.pyx +fiona/_vectorized.c tests/data/coutwildrnp.json tests/data/coutwildrnp.tar tests/data/coutwildrnp.zip From 58bcba91bf8e7bf9532a0f59e6b6f308fe1ab17c Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Sat, 24 Mar 2018 14:30:13 +0000 Subject: [PATCH 07/10] Support binary fields in read_vectorized --- fiona/_vectorized.pyx | 27 ++++++++++++++++--- tests/test_binary_field.py | 55 ++++++++++++++++++++------------------ tests/test_vectorized.py | 16 +++++++++++ 3 files changed, 69 insertions(+), 29 deletions(-) diff --git a/fiona/_vectorized.pyx b/fiona/_vectorized.pyx index c0757bb59..f2d4f8162 100644 --- a/fiona/_vectorized.pyx +++ b/fiona/_vectorized.pyx @@ -21,6 +21,8 @@ def read_vectorized(collection): cdef char * field_name_c cdef bytes field_name_bytes cdef int i + cdef int l + cdef long long fid cdef long long [:] arr_int cdef double [:] arr_double cdef char * wkt @@ -33,6 +35,7 @@ def read_vectorized(collection): length = OGR_L_GetFeatureCount(session.cogr_layer, 0) + data_fids = np.empty([length], dtype=object) data_properties = {} if collection.ignore_fields: @@ -51,20 +54,31 @@ def read_vectorized(collection): for field_name, field_type in schema["properties"].items(): if field_name in ignore_fields: continue - field_type, precision = field_type.split(":") + if ":" in field_type: + field_type, precision = field_type.split(":") + else: + precision = None if field_type == "int": data_properties[field_name] = np.empty([length], dtype=np.int64) elif field_type == "float": data_properties[field_name] = np.empty([length], dtype=np.float64) elif field_type == "str": data_properties[field_name] = np.empty([length], dtype=object) + elif field_type == "bytes": + data_properties[field_name] = np.empty([length], dtype=object) else: # TODO: other types (dates, bytes, boolean subtype) raise TypeError("Unexpected field type: {}".format(field_type)) + OGR_L_ResetReading(session.cogr_layer) for feature_index in range(length): - # TODO: this isn't the correct way to iterate over features - cogr_feature = OGR_L_GetFeature(session.cogr_layer, feature_index) + cogr_feature = OGR_L_GetNextFeature(session.cogr_layer) + + if cogr_feature == NULL: + raise ValueError("Failed to read feature {}".format(feature_index)) + + fid = OGR_F_GetFID(cogr_feature) + data_fids[feature_index] = str(fid) num_fields = OGR_F_GetFieldCount(cogr_feature) for field_index in range(num_fields): @@ -83,6 +97,7 @@ def read_vectorized(collection): field_type = FIELD_TYPES_MAP[field_type_name] if field_type is int: + # TODO: support boolean subtype arr_int = data_properties[field_name] if is_field_null(cogr_feature, field_index): # TODO: this isn't the correct way to handle NULL for ints @@ -107,6 +122,11 @@ def read_vectorized(collection): "Failed to decode %s using %s codec", value, encoding) arr = data_properties[field_name] arr[feature_index] = value + # TODO: support date dtype + elif field_type is bytes: + data = OGR_F_GetFieldAsBinary(cogr_feature, field_index, &l) + arr = data_properties[field_name] + arr[feature_index] = data[:l] else: raise TypeError("Unexpected field type: {}".format(field_type)) @@ -123,6 +143,7 @@ def read_vectorized(collection): _deleteOgrFeature(cogr_feature) features = { + "id": data_fids, "geometry": data_geometry, "properties": data_properties, } diff --git a/tests/test_binary_field.py b/tests/test_binary_field.py index fc4376675..6548435e6 100644 --- a/tests/test_binary_field.py +++ b/tests/test_binary_field.py @@ -9,6 +9,33 @@ from collections import OrderedDict from .conftest import requires_gpkg +def write_binary_gpkg(path): + meta = { + "driver": "GPKG", + "schema": { + "geometry": "Point", + "properties": OrderedDict([ + ("name", "str"), + ("data", "bytes"), + ]) + } + } + + # create some binary data to encode + data = binascii.a2b_hex(b"deadbeef") + + # write the binary data to a BLOB field + with fiona.open(path, "w", **meta) as dst: + feature = { + "geometry": {"type": "Point", "coordinates": ((0,0))}, + "properties": { + "name": "test", + "data": data + } + } + dst.write(feature) + + class TestBinaryField(unittest.TestCase): def setUp(self): self.tempdir = tempfile.mkdtemp() @@ -18,33 +45,9 @@ def tearDown(self): @requires_gpkg def test_binary_field(self): - meta = { - "driver": "GPKG", - "schema": { - "geometry": "Point", - "properties": OrderedDict([ - ("name", "str"), - ("data", "bytes"), - ]) - } - } - - # create some binary data to encode - data = binascii.a2b_hex(b"deadbeef") - - # write the binary data to a BLOB field filename = os.path.join(self.tempdir, "binary_test.gpkg") - with fiona.open(filename, "w", **meta) as dst: - feature = { - "geometry": {"type": "Point", "coordinates": ((0,0))}, - "properties": { - "name": "test", - "data": data - } - } - dst.write(feature) - - del(data) + + write_binary_gpkg(filename) # read the data back and check consistency with fiona.open(filename, "r") as src: diff --git a/tests/test_vectorized.py b/tests/test_vectorized.py index 5d62e9089..fdd8ef2f1 100644 --- a/tests/test_vectorized.py +++ b/tests/test_vectorized.py @@ -1,5 +1,6 @@ import pytest import fiona +import binascii from six import integer_types, string_types try: from fiona._vectorized import read_vectorized @@ -8,6 +9,8 @@ else: import numpy as np from numpy.testing import assert_allclose +from .conftest import requires_gpkg +from .test_binary_field import write_binary_gpkg def test_read_vectorized(path_coutwildrnp_shp): with fiona.open(path_coutwildrnp_shp, "r") as collection: @@ -54,3 +57,16 @@ def test_ignore_geometry(path_coutwildrnp_shp): with fiona.open(path_coutwildrnp_shp, ignore_geometry=True) as collection: features = read_vectorized(collection) assert features["geometry"] is None + +@requires_gpkg +def test_binary_field(tmpdir): + filename = str(tmpdir.join("test.gpkg")) + write_binary_gpkg(filename) + + with fiona.open(filename, "r") as collection: + print(collection.schema) + features = read_vectorized(collection) + + assert(features["properties"]["name"][0] == "test") + data = features["properties"]["data"][0] + assert(binascii.b2a_hex(data) == b"deadbeef") From b58ff97dac49144596a66f8ef5ff833ba0666421 Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Sat, 24 Mar 2018 15:24:42 +0000 Subject: [PATCH 08/10] Support datetime fields in read_vectorized --- fiona/_vectorized.pyx | 30 +++++++++++++++++++++++++++++- tests/test_vectorized.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/fiona/_vectorized.pyx b/fiona/_vectorized.pyx index f2d4f8162..ab31cb526 100644 --- a/fiona/_vectorized.pyx +++ b/fiona/_vectorized.pyx @@ -1,9 +1,11 @@ from .ogrext cimport Session, _deleteOgrFeature from .ogrext import FIELD_TYPES, FIELD_TYPES_MAP, OGRERR_NONE from ._shim cimport * +from fiona.rfc3339 import FionaDateType, FionaDateTimeType, FionaTimeType import logging from six import text_type +import datetime import numpy as np cimport numpy as np @@ -22,6 +24,13 @@ def read_vectorized(collection): cdef bytes field_name_bytes cdef int i cdef int l + cdef int y = 0 + cdef int m = 0 + cdef int d = 0 + cdef int hh = 0 + cdef int mm = 0 + cdef int ss = 0 + cdef int tz = 0 cdef long long fid cdef long long [:] arr_int cdef double [:] arr_double @@ -66,6 +75,13 @@ def read_vectorized(collection): data_properties[field_name] = np.empty([length], dtype=object) elif field_type == "bytes": data_properties[field_name] = np.empty([length], dtype=object) + elif field_type == "date": + data_properties[field_name] = np.empty([length], dtype='datetime64[D]') + elif field_type == "time": + # numpy has no dtype for time without date + data_properties[field_name] = np.empty([length], dtype=object) + elif field_type == "datetime": + data_properties[field_name] = np.empty([length], dtype='datetime64[s]') else: # TODO: other types (dates, bytes, boolean subtype) raise TypeError("Unexpected field type: {}".format(field_type)) @@ -122,7 +138,19 @@ def read_vectorized(collection): "Failed to decode %s using %s codec", value, encoding) arr = data_properties[field_name] arr[feature_index] = value - # TODO: support date dtype + elif field_type in (FionaDateType, FionaTimeType, FionaDateTimeType): + arr = data_properties[field_name] + retval = OGR_F_GetFieldAsDateTime( + cogr_feature, field_index, &y, &m, &d, &hh, &mm, &ss, &tz) + if not retval: + arr[feature_index] = None + else: + if field_type is FionaDateType: + arr[feature_index] = datetime.date(y, m, d).isoformat() + elif field_type is FionaTimeType: + arr[feature_index] = datetime.time(hh, mm, ss).isoformat() + else: + arr[feature_index] = datetime.datetime(y, m, d, hh, mm, ss).isoformat() elif field_type is bytes: data = OGR_F_GetFieldAsBinary(cogr_feature, field_index, &l) arr = data_properties[field_name] diff --git a/tests/test_vectorized.py b/tests/test_vectorized.py index fdd8ef2f1..17adf6053 100644 --- a/tests/test_vectorized.py +++ b/tests/test_vectorized.py @@ -70,3 +70,36 @@ def test_binary_field(tmpdir): assert(features["properties"]["name"][0] == "test") data = features["properties"]["data"][0] assert(binascii.b2a_hex(data) == b"deadbeef") + +@requires_gpkg # ESRI Shapefile doesn't support datetime fields +def test_datetime_fields(tmpdir): + filename = str(tmpdir.join("test.gpkg")) + schema = { + "geometry": "Point", + "properties": [ + ("date", "date"), + ("datetime", "datetime"), + ("nulldt", "datetime"), + ] + } + with fiona.open(filename, "w", driver="GPKG", schema=schema) as dst: + feature = { + "geometry": None, + "properties": { + "date": "2018-03-24", + "datetime": "2018-03-24T15:06:01", + "nulldt": None, + } + } + dst.write(feature) + + with fiona.open(filename, "r") as src: + features = read_vectorized(src) + + assert features["properties"]["date"].dtype.name == "datetime64[D]" + assert features["properties"]["datetime"].dtype.name == "datetime64[s]" + assert features["properties"]["nulldt"].dtype.name == "datetime64[s]" + + assert features["properties"]["date"][0] == np.datetime64("2018-03-24") + assert features["properties"]["datetime"][0] == np.datetime64("2018-03-24T15:06:01") + assert str(features["properties"]["nulldt"][0]) == "NaT" From 35a4308502e4760ec06e757dfb5c0d466526f14c Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Sat, 24 Mar 2018 15:42:12 +0000 Subject: [PATCH 09/10] Comment tweaks --- fiona/_vectorized.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fiona/_vectorized.pyx b/fiona/_vectorized.pyx index ab31cb526..fdc86f170 100644 --- a/fiona/_vectorized.pyx +++ b/fiona/_vectorized.pyx @@ -83,7 +83,6 @@ def read_vectorized(collection): elif field_type == "datetime": data_properties[field_name] = np.empty([length], dtype='datetime64[s]') else: - # TODO: other types (dates, bytes, boolean subtype) raise TypeError("Unexpected field type: {}".format(field_type)) OGR_L_ResetReading(session.cogr_layer) @@ -116,7 +115,7 @@ def read_vectorized(collection): # TODO: support boolean subtype arr_int = data_properties[field_name] if is_field_null(cogr_feature, field_index): - # TODO: this isn't the correct way to handle NULL for ints + # TODO: is this the best way to handle NULL values for int? arr_int[feature_index] = 0 else: arr_int[feature_index] = OGR_F_GetFieldAsInteger64(cogr_feature, field_index) From dc4ab3d7ba472716933e8700e0ef6fb37b9bf8be Mon Sep 17 00:00:00 2001 From: Joshua Arnott Date: Sat, 24 Mar 2018 16:02:21 +0000 Subject: [PATCH 10/10] Support for WKB as geometry type in read_vectorized --- fiona/_vectorized.pyx | 12 +++++++++++- tests/test_vectorized.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fiona/_vectorized.pyx b/fiona/_vectorized.pyx index fdc86f170..84fa37e02 100644 --- a/fiona/_vectorized.pyx +++ b/fiona/_vectorized.pyx @@ -1,6 +1,7 @@ from .ogrext cimport Session, _deleteOgrFeature from .ogrext import FIELD_TYPES, FIELD_TYPES_MAP, OGRERR_NONE from ._shim cimport * +from libc.stdlib cimport malloc, free from fiona.rfc3339 import FionaDateType, FionaDateTimeType, FionaTimeType import logging @@ -12,7 +13,7 @@ cimport numpy as np log = logging.getLogger(__name__) -def read_vectorized(collection): +def read_vectorized(collection, use_wkb=False): cdef Session session cdef void * cogr_feature cdef void * cogr_geometry @@ -35,6 +36,7 @@ def read_vectorized(collection): cdef long long [:] arr_int cdef double [:] arr_double cdef char * wkt + cdef char * wkb session = collection.session encoding = session._fileencoding @@ -161,6 +163,14 @@ def read_vectorized(collection): cogr_geometry = OGR_F_GetGeometryRef(cogr_feature) if cogr_geometry == NULL: data_geometry[feature_index] = None + elif use_wkb: + length = OGR_G_WkbSize(cogr_geometry) + wkb = malloc(sizeof(char)*length) + result = OGR_G_ExportToWkb(cogr_geometry, 1, wkb) + if result != OGRERR_NONE: + raise ValueError("Failed to export geometry to WKB") + data_geometry[feature_index] = wkb[:length] + free(wkb) else: result = OGR_G_ExportToWkt(cogr_geometry, &wkt) if result != OGRERR_NONE: diff --git a/tests/test_vectorized.py b/tests/test_vectorized.py index 17adf6053..248f724ea 100644 --- a/tests/test_vectorized.py +++ b/tests/test_vectorized.py @@ -103,3 +103,13 @@ def test_datetime_fields(tmpdir): assert features["properties"]["date"][0] == np.datetime64("2018-03-24") assert features["properties"]["datetime"][0] == np.datetime64("2018-03-24T15:06:01") assert str(features["properties"]["nulldt"][0]) == "NaT" + +def test_wkb(path_coutwildrnp_shp): + with fiona.open(path_coutwildrnp_shp, "r") as collection: + features = read_vectorized(collection, use_wkb=True) + + geometry = features["geometry"][0] + assert geometry[0:1] == b"\x01" # little endian + assert geometry[1:5] == b"\x03\x00\x00\x00" # polygon + assert geometry[5:9] == b"\x01\x00\x00\x00" # 1 ring + assert len(geometry) == 1325