diff --git a/.gitignore b/.gitignore index b16558592..f6847a21e 100644 --- a/.gitignore +++ b/.gitignore @@ -76,6 +76,7 @@ fiona/_shim2.c fiona/_shim22.c fiona/_shim.pxd fiona/_shim.pyx +fiona/_vectorized.c tests/data/coutwildrnp.json tests/data/coutwildrnp.tar tests/data/coutwildrnp.zip diff --git a/fiona/_vectorized.pyx b/fiona/_vectorized.pyx new file mode 100644 index 000000000..84fa37e02 --- /dev/null +++ b/fiona/_vectorized.pyx @@ -0,0 +1,188 @@ +from .ogrext cimport Session, _deleteOgrFeature +from .ogrext import FIELD_TYPES, FIELD_TYPES_MAP, OGRERR_NONE +from ._shim cimport * +from libc.stdlib cimport malloc, free +from fiona.rfc3339 import FionaDateType, FionaDateTimeType, FionaTimeType + +import logging +from six import text_type +import datetime + +import numpy as np +cimport numpy as np + +log = logging.getLogger(__name__) + +def read_vectorized(collection, use_wkb=False): + cdef Session session + cdef void * cogr_feature + cdef void * cogr_geometry + cdef int num_fields + cdef void * fdefn + cdef int feature_index + cdef int field_index + cdef char * field_name_c + cdef bytes field_name_bytes + cdef int i + cdef int l + cdef int y = 0 + cdef int m = 0 + cdef int d = 0 + cdef int hh = 0 + cdef int mm = 0 + cdef int ss = 0 + cdef int tz = 0 + cdef long long fid + cdef long long [:] arr_int + cdef double [:] arr_double + cdef char * wkt + cdef char * wkb + + session = collection.session + encoding = session._fileencoding + + if session.cogr_layer == NULL: + raise ValueError("Null layer") + + length = OGR_L_GetFeatureCount(session.cogr_layer, 0) + + data_fids = np.empty([length], dtype=object) + data_properties = {} + + if collection.ignore_fields: + ignore_fields = set(collection.ignore_fields) + else: + ignore_fields = set() + + if collection.ignore_geometry: + ignore_geometry = True + data_geometry = None + else: + ignore_geometry = False + data_geometry = np.empty([length], dtype=object) + + schema = session.get_schema() + for field_name, field_type in schema["properties"].items(): + if field_name in ignore_fields: + continue + if ":" in field_type: + field_type, precision = field_type.split(":") + else: + precision = None + if field_type == "int": + data_properties[field_name] = np.empty([length], dtype=np.int64) + elif field_type == "float": + data_properties[field_name] = np.empty([length], dtype=np.float64) + elif field_type == "str": + data_properties[field_name] = np.empty([length], dtype=object) + elif field_type == "bytes": + data_properties[field_name] = np.empty([length], dtype=object) + elif field_type == "date": + data_properties[field_name] = np.empty([length], dtype='datetime64[D]') + elif field_type == "time": + # numpy has no dtype for time without date + data_properties[field_name] = np.empty([length], dtype=object) + elif field_type == "datetime": + data_properties[field_name] = np.empty([length], dtype='datetime64[s]') + else: + raise TypeError("Unexpected field type: {}".format(field_type)) + + OGR_L_ResetReading(session.cogr_layer) + for feature_index in range(length): + cogr_feature = OGR_L_GetNextFeature(session.cogr_layer) + + if cogr_feature == NULL: + raise ValueError("Failed to read feature {}".format(feature_index)) + + fid = OGR_F_GetFID(cogr_feature) + data_fids[feature_index] = str(fid) + + num_fields = OGR_F_GetFieldCount(cogr_feature) + for field_index in range(num_fields): + fdefn = OGR_F_GetFieldDefnRef(cogr_feature, field_index) + + # field name + field_name_c = OGR_Fld_GetNameRef(fdefn) + field_name_bytes = field_name_c + field_name = field_name_bytes.decode(encoding) + if field_name in ignore_fields: + continue + + # field type + field_type_id = OGR_Fld_GetType(fdefn) + field_type_name = FIELD_TYPES[field_type_id] + field_type = FIELD_TYPES_MAP[field_type_name] + + if field_type is int: + # TODO: support boolean subtype + arr_int = data_properties[field_name] + if is_field_null(cogr_feature, field_index): + # TODO: is this the best way to handle NULL values for int? + arr_int[feature_index] = 0 + else: + arr_int[feature_index] = OGR_F_GetFieldAsInteger64(cogr_feature, field_index) + elif field_type is float: + arr_double = data_properties[field_name] + if is_field_null(cogr_feature, field_index): + arr_double[feature_index] = np.nan + else: + arr_double[feature_index] = OGR_F_GetFieldAsDouble(cogr_feature, field_index) + elif field_type is text_type: + if is_field_null(cogr_feature, field_index): + value = None + else: + try: + value = OGR_F_GetFieldAsString(cogr_feature, field_index) + value = value.decode(encoding) + except UnicodeDecodeError: + log.warning( + "Failed to decode %s using %s codec", value, encoding) + arr = data_properties[field_name] + arr[feature_index] = value + elif field_type in (FionaDateType, FionaTimeType, FionaDateTimeType): + arr = data_properties[field_name] + retval = OGR_F_GetFieldAsDateTime( + cogr_feature, field_index, &y, &m, &d, &hh, &mm, &ss, &tz) + if not retval: + arr[feature_index] = None + else: + if field_type is FionaDateType: + arr[feature_index] = datetime.date(y, m, d).isoformat() + elif field_type is FionaTimeType: + arr[feature_index] = datetime.time(hh, mm, ss).isoformat() + else: + arr[feature_index] = datetime.datetime(y, m, d, hh, mm, ss).isoformat() + elif field_type is bytes: + data = OGR_F_GetFieldAsBinary(cogr_feature, field_index, &l) + arr = data_properties[field_name] + arr[feature_index] = data[:l] + else: + raise TypeError("Unexpected field type: {}".format(field_type)) + + if not ignore_geometry: + cogr_geometry = OGR_F_GetGeometryRef(cogr_feature) + if cogr_geometry == NULL: + data_geometry[feature_index] = None + elif use_wkb: + length = OGR_G_WkbSize(cogr_geometry) + wkb = malloc(sizeof(char)*length) + result = OGR_G_ExportToWkb(cogr_geometry, 1, wkb) + if result != OGRERR_NONE: + raise ValueError("Failed to export geometry to WKB") + data_geometry[feature_index] = wkb[:length] + free(wkb) + else: + result = OGR_G_ExportToWkt(cogr_geometry, &wkt) + if result != OGRERR_NONE: + raise ValueError("Failed to export geometry to WKT") + data_geometry[feature_index] = wkt + + _deleteOgrFeature(cogr_feature) + + features = { + "id": data_fids, + "geometry": data_geometry, + "properties": data_properties, + } + + return features diff --git a/fiona/ogrext.pxd b/fiona/ogrext.pxd new file mode 100644 index 000000000..66adb5587 --- /dev/null +++ b/fiona/ogrext.pxd @@ -0,0 +1,8 @@ +cdef class Session: + cdef void *cogr_ds + cdef void *cogr_layer + cdef object _fileencoding + cdef object _encoding + cdef object collection + +cdef _deleteOgrFeature(void *cogr_feature) diff --git a/fiona/ogrext.pyx b/fiona/ogrext.pyx index 4e3dcd666..029e18482 100644 --- a/fiona/ogrext.pyx +++ b/fiona/ogrext.pyx @@ -397,13 +397,6 @@ def featureRT(feature, collection): # Collection-related extension classes and functions cdef class Session: - - cdef void *cogr_ds - cdef void *cogr_layer - cdef object _fileencoding - cdef object _encoding - cdef object collection - def __init__(self): self.cogr_ds = NULL self.cogr_layer = NULL diff --git a/fiona/ogrext1.pxd b/fiona/ogrext1.pxd index 305ed505d..fed22f236 100644 --- a/fiona/ogrext1.pxd +++ b/fiona/ogrext1.pxd @@ -135,7 +135,8 @@ cdef extern from "ogr_api.h": void * OGR_G_CreateGeometry (int wkbtypecode) void OGR_G_DestroyGeometry (void *geometry) unsigned char * OGR_G_ExportToJson (void *geometry) - void OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer) + OGRErr OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer) + OGRErr OGR_G_ExportToWkt (void *geometry, char **wkt) int OGR_G_GetCoordinateDimension (void *geometry) int OGR_G_GetGeometryCount (void *geometry) unsigned char * OGR_G_GetGeometryName (void *geometry) diff --git a/fiona/ogrext2.pxd b/fiona/ogrext2.pxd index 9eecd832a..544375375 100644 --- a/fiona/ogrext2.pxd +++ b/fiona/ogrext2.pxd @@ -196,7 +196,8 @@ cdef extern from "ogr_api.h": void * OGR_G_CreateGeometry (int wkbtypecode) void OGR_G_DestroyGeometry (void *geometry) unsigned char * OGR_G_ExportToJson (void *geometry) - void OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer) + OGRErr OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer) + OGRErr OGR_G_ExportToWkt (void *geometry, char **wkt) int OGR_G_GetCoordinateDimension (void *geometry) int OGR_G_GetGeometryCount (void *geometry) unsigned char * OGR_G_GetGeometryName (void *geometry) diff --git a/requirements.txt b/requirements.txt index 563bac9ce..f42d07625 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ cligj>=0.4 six>=1.7 ordereddict munch +numpy diff --git a/setup.py b/setup.py index 44ce8a513..274005a36 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,13 @@ from setuptools import setup from setuptools.extension import Extension +# NumPy is required for vectorized submodule +try: + import numpy as np +except ImportError: + has_numpy = False +else: + has_numpy = True # Use Cython if available. try: @@ -180,6 +187,9 @@ def run(self): libraries=libraries, extra_link_args=extra_link_args) +if has_numpy: + ext_options["include_dirs"].append(np.get_include()) + ext_options_cpp = ext_options.copy() # GDAL 2.3+ requires C++11 if sys.platform == "win32": @@ -212,14 +222,17 @@ def run(self): shutil.copy('fiona/_shim2.pyx', 'fiona/_shim.pyx') shutil.copy('fiona/_shim2.pxd', 'fiona/_shim.pxd') - ext_modules = cythonize([ + ext_modules = [ Extension('fiona._geometry', ['fiona/_geometry.pyx'], **ext_options), Extension('fiona._transform', ['fiona/_transform.pyx'], **ext_options_cpp), Extension('fiona._crs', ['fiona/_crs.pyx'], **ext_options), Extension('fiona._drivers', ['fiona/_drivers.pyx'], **ext_options), Extension('fiona._err', ['fiona/_err.pyx'], **ext_options), Extension('fiona._shim', ['fiona/_shim.pyx'], **ext_options), - Extension('fiona.ogrext', ['fiona/ogrext.pyx'], **ext_options)]) + Extension('fiona.ogrext', ['fiona/ogrext.pyx'], **ext_options)] + if has_numpy: + ext_modules.append(Extension('fiona._vectorized', ['fiona/_vectorized.pyx'], **ext_options)) + ext_modules = cythonize(ext_modules) # If there's no manifest template, as in an sdist, we just specify .c files. elif "clean" not in sys.argv: @@ -231,6 +244,8 @@ def run(self): Extension('fiona._err', ['fiona/_err.c'], **ext_options), Extension('fiona.ogrext', ['fiona/ogrext.c'], **ext_options), ] + if has_numpy: + ext_modules.append(Extension('fiona._vectorized', ['fiona/_vectorized.c'], **ext_options)) if gdal_major_version == 1: log.info("Building Fiona for gdal 1.x: {0}".format(gdalversion)) diff --git a/tests/test_binary_field.py b/tests/test_binary_field.py index fc4376675..6548435e6 100644 --- a/tests/test_binary_field.py +++ b/tests/test_binary_field.py @@ -9,6 +9,33 @@ from collections import OrderedDict from .conftest import requires_gpkg +def write_binary_gpkg(path): + meta = { + "driver": "GPKG", + "schema": { + "geometry": "Point", + "properties": OrderedDict([ + ("name", "str"), + ("data", "bytes"), + ]) + } + } + + # create some binary data to encode + data = binascii.a2b_hex(b"deadbeef") + + # write the binary data to a BLOB field + with fiona.open(path, "w", **meta) as dst: + feature = { + "geometry": {"type": "Point", "coordinates": ((0,0))}, + "properties": { + "name": "test", + "data": data + } + } + dst.write(feature) + + class TestBinaryField(unittest.TestCase): def setUp(self): self.tempdir = tempfile.mkdtemp() @@ -18,33 +45,9 @@ def tearDown(self): @requires_gpkg def test_binary_field(self): - meta = { - "driver": "GPKG", - "schema": { - "geometry": "Point", - "properties": OrderedDict([ - ("name", "str"), - ("data", "bytes"), - ]) - } - } - - # create some binary data to encode - data = binascii.a2b_hex(b"deadbeef") - - # write the binary data to a BLOB field filename = os.path.join(self.tempdir, "binary_test.gpkg") - with fiona.open(filename, "w", **meta) as dst: - feature = { - "geometry": {"type": "Point", "coordinates": ((0,0))}, - "properties": { - "name": "test", - "data": data - } - } - dst.write(feature) - - del(data) + + write_binary_gpkg(filename) # read the data back and check consistency with fiona.open(filename, "r") as src: diff --git a/tests/test_vectorized.py b/tests/test_vectorized.py new file mode 100644 index 000000000..248f724ea --- /dev/null +++ b/tests/test_vectorized.py @@ -0,0 +1,115 @@ +import pytest +import fiona +import binascii +from six import integer_types, string_types +try: + from fiona._vectorized import read_vectorized +except ImportError: + pytestmark = pytest.mark.skip +else: + import numpy as np + from numpy.testing import assert_allclose +from .conftest import requires_gpkg +from .test_binary_field import write_binary_gpkg + +def test_read_vectorized(path_coutwildrnp_shp): + with fiona.open(path_coutwildrnp_shp, "r") as collection: + features = read_vectorized(collection) + + assert len(features["geometry"]) == 67 + assert features["geometry"].dtype == object + assert features["geometry"][0].decode("ascii").startswith("POLYGON (") + assert features["geometry"][-1].decode("ascii").startswith("POLYGON (") + # TODO: better checks for geometry + + # check number of properties + assert len(features["properties"]) == len(collection.schema["properties"]) + + # float + assert features["properties"]["PERIMETER"].dtype == np.float64 + assert features["properties"]["PERIMETER"].shape == (67,) + assert_allclose(features["properties"]["PERIMETER"][0], 1.22107) + assert_allclose(features["properties"]["PERIMETER"][-1], 0.120627) + + # integer + assert features["properties"]["WILDRNP020"].dtype == np.int64 + assert features["properties"]["WILDRNP020"].shape == (67,) + assert features["properties"]["WILDRNP020"][0] == 332 + assert features["properties"]["WILDRNP020"][-1] == 511 + + # string + assert isinstance(features["properties"]["NAME"].dtype, object) + assert features["properties"]["NAME"].shape == (67,) + assert features["properties"]["NAME"][0] == "Mount Naomi Wilderness" + assert features["properties"]["NAME"][-1] == "Mesa Verde Wilderness" + +def test_ignore_fields(path_coutwildrnp_shp): + with fiona.open(path_coutwildrnp_shp, ignore_fields=["NAME"]) as collection: + features = read_vectorized(collection) + + assert "PERIMETER" in features["properties"] + assert "WILDRNP020" in features["properties"] + assert "NAME" not in features["properties"] + + assert features["geometry"] is not None + +def test_ignore_geometry(path_coutwildrnp_shp): + with fiona.open(path_coutwildrnp_shp, ignore_geometry=True) as collection: + features = read_vectorized(collection) + assert features["geometry"] is None + +@requires_gpkg +def test_binary_field(tmpdir): + filename = str(tmpdir.join("test.gpkg")) + write_binary_gpkg(filename) + + with fiona.open(filename, "r") as collection: + print(collection.schema) + features = read_vectorized(collection) + + assert(features["properties"]["name"][0] == "test") + data = features["properties"]["data"][0] + assert(binascii.b2a_hex(data) == b"deadbeef") + +@requires_gpkg # ESRI Shapefile doesn't support datetime fields +def test_datetime_fields(tmpdir): + filename = str(tmpdir.join("test.gpkg")) + schema = { + "geometry": "Point", + "properties": [ + ("date", "date"), + ("datetime", "datetime"), + ("nulldt", "datetime"), + ] + } + with fiona.open(filename, "w", driver="GPKG", schema=schema) as dst: + feature = { + "geometry": None, + "properties": { + "date": "2018-03-24", + "datetime": "2018-03-24T15:06:01", + "nulldt": None, + } + } + dst.write(feature) + + with fiona.open(filename, "r") as src: + features = read_vectorized(src) + + assert features["properties"]["date"].dtype.name == "datetime64[D]" + assert features["properties"]["datetime"].dtype.name == "datetime64[s]" + assert features["properties"]["nulldt"].dtype.name == "datetime64[s]" + + assert features["properties"]["date"][0] == np.datetime64("2018-03-24") + assert features["properties"]["datetime"][0] == np.datetime64("2018-03-24T15:06:01") + assert str(features["properties"]["nulldt"][0]) == "NaT" + +def test_wkb(path_coutwildrnp_shp): + with fiona.open(path_coutwildrnp_shp, "r") as collection: + features = read_vectorized(collection, use_wkb=True) + + geometry = features["geometry"][0] + assert geometry[0:1] == b"\x01" # little endian + assert geometry[1:5] == b"\x03\x00\x00\x00" # polygon + assert geometry[5:9] == b"\x01\x00\x00\x00" # 1 ring + assert len(geometry) == 1325