Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4951e74
Refactor 'decimal' type to 'float'
luis11011 Oct 6, 2021
ce9d5e1
Remove decimal
luis11011 Oct 8, 2021
4b6a508
Fix float type inferring bugs
luis11011 Oct 11, 2021
599be72
Fix quality on user set data types bug
luis11011 Oct 11, 2021
bbd030d
Fix dask.distributed bug
luis11011 Oct 11, 2021
9c605d3
Fix partial profiling bug
luis11011 Oct 13, 2021
2d6970c
Fix sort bug on Pandas
luis11011 Oct 13, 2021
fe1a4b4
Include exclusive joins
luis11011 Oct 14, 2021
6c283fe
Fix local load bug on Pandas
luis11011 Oct 14, 2021
ce9f491
Adjust mode and percentile functions output formats
luis11011 Oct 15, 2021
b0ea696
Adjust parser function descriptions
luis11011 Oct 15, 2021
59a70b1
Allow dataframe values on fill_na
luis11011 Oct 15, 2021
485f9ea
Fix match mask parameter bug
luis11011 Oct 15, 2021
8f396e8
Fix transformations bug on partial profiling
luis11011 Oct 18, 2021
e539130
Fix aggregation and group by operations
luis11011 Oct 18, 2021
0d8a63a
Fix fill_na eval value bug
luis11011 Oct 18, 2021
1768f4f
Fix load file name bug
luis11011 Oct 18, 2021
e18711c
Adjust sort
luis11011 Oct 20, 2021
75f6fd7
Fix match_positions function bugs
luis11011 Oct 22, 2021
4478595
Fix empty dataset profiling bug
luis11011 Oct 25, 2021
b828e6c
Default index_to_string and string_to_index behavior to replace
luis11011 Oct 28, 2021
f959515
Implement copy dataframe function
luis11011 Oct 28, 2021
1fa40d3
Fix copy Label Encoder instance
luis11011 Oct 28, 2021
1be2932
Fix docs url
argenisleon Oct 30, 2021
bb71a9d
Fix formatting
argenisleon Oct 30, 2021
7819fb5
Update dask from 2021.9.0 to 2021.10.0
pyup-bot Nov 1, 2021
e646e50
Update dask from 2021.4.0 to 2021.10.0
pyup-bot Nov 1, 2021
c4093bd
Update dask from 2021.9.0 to 2021.10.0
pyup-bot Nov 1, 2021
02e4c4f
Update dask from 2021.9.0 to 2021.10.0
pyup-bot Nov 1, 2021
0996a1c
Update tensorflow from 2.5.1 to 2.6.0
pyup-bot Nov 1, 2021
41b4173
Update tensorflow from 2.5.1 to 2.6.0
pyup-bot Nov 1, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
Refactor 'decimal' type to 'float'
  • Loading branch information
luis11011 committed Oct 6, 2021
commit 4951e7470643f6da36b15a79b8b01890668995a9
2 changes: 1 addition & 1 deletion optimus/engines/base/basedataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1166,7 +1166,7 @@ def report(self, df, cols="*", buckets=MAX_BUCKETS, infer=False, relative_error=
"hist_hours": hist_hour, "hist_minutes": hist_minute}

elif col["column_data_type"] == "int" or col["column_data_type"] == "string" or col[
"column_data_type"] == "decimal":
"column_data_type"] == "float":
hist = plot_hist({col_name: hist_dict}, output="base64")
hist_pic = {"hist_numeric_string": hist}
if "frequency" in col:
Expand Down
2 changes: 1 addition & 1 deletion optimus/engines/base/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -3478,7 +3478,7 @@ def infer_type(self, cols="*", sample=INFER_PROFILER_ROWS, tidy=True) -> dict:
dtype_i = 0

if len(dtypes) > 1:
if dtypes[0] == ProfilerDataTypes.INT.value and dtypes[1] == ProfilerDataTypes.DECIMAL.value:
if dtypes[0] == ProfilerDataTypes.INT.value and dtypes[1] == ProfilerDataTypes.FLOAT.value:
dtype_i = 1

if dtypes[0] == ProfilerDataTypes.ZIP_CODE.value and dtypes[1] == ProfilerDataTypes.INT.value:
Expand Down
20 changes: 10 additions & 10 deletions optimus/engines/base/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class BaseConstants:

# inferred/input to internal
OPTIMUS_TO_INTERNAL = {ProfilerDataTypes.INT.value: "int",
ProfilerDataTypes.DECIMAL.value: "float",
ProfilerDataTypes.FLOAT.value: "float",
ProfilerDataTypes.STRING.value: "str",
ProfilerDataTypes.BOOL.value: "bool",
ProfilerDataTypes.BOOLEAN.value: "bool",
Expand Down Expand Up @@ -52,13 +52,13 @@ class BaseConstants:
"uint64": ProfilerDataTypes.INT.value,
"binary": ProfilerDataTypes.INT.value,
"large_binary": ProfilerDataTypes.INT.value,
"numeric": ProfilerDataTypes.DECIMAL.value,
"float": ProfilerDataTypes.DECIMAL.value,
"float16": ProfilerDataTypes.DECIMAL.value,
"float32": ProfilerDataTypes.DECIMAL.value,
"float64": ProfilerDataTypes.DECIMAL.value,
"float_": ProfilerDataTypes.DECIMAL.value,
"double": ProfilerDataTypes.DECIMAL.value,
"numeric": ProfilerDataTypes.FLOAT.value,
"float": ProfilerDataTypes.FLOAT.value,
"float16": ProfilerDataTypes.FLOAT.value,
"float32": ProfilerDataTypes.FLOAT.value,
"float64": ProfilerDataTypes.FLOAT.value,
"float_": ProfilerDataTypes.FLOAT.value,
"double": ProfilerDataTypes.FLOAT.value,
"bool_": ProfilerDataTypes.BOOL.value,
"date": ProfilerDataTypes.DATETIME.value,
"date32": ProfilerDataTypes.DATETIME.value,
Expand Down Expand Up @@ -105,7 +105,7 @@ def INT_INTERNAL_TYPES(self):

@property
def NUMERIC_INTERNAL_TYPES(self):
types = [ProfilerDataTypes.INT.value, ProfilerDataTypes.DECIMAL.value]
types = [ProfilerDataTypes.INT.value, ProfilerDataTypes.FLOAT.value]
return types + [item[0] for item in self.INTERNAL_TO_OPTIMUS.items() if item[1] in types]

@property
Expand All @@ -124,7 +124,7 @@ def INT_TYPES(self):

@property
def NUMERIC_TYPES(self):
types = [ProfilerDataTypes.INT.value, ProfilerDataTypes.DECIMAL.value]
types = [ProfilerDataTypes.INT.value, ProfilerDataTypes.FLOAT.value]
return self.ANY_TYPES + types +\
[item[0] for item in self.INTERNAL_TO_OPTIMUS.items() if item[1] in types]

Expand Down
6 changes: 3 additions & 3 deletions optimus/engines/base/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from optimus.helpers.constants import ProfilerDataTypes
from optimus.helpers.core import one_tuple_to_val, val_to_list
from optimus.infer import is_datetime_str, is_list, is_list_of_list, is_null, is_bool, \
is_credit_card_number, is_zip_code, is_decimal, is_datetime, is_valid_datetime_format, \
is_credit_card_number, is_zip_code, is_float, is_datetime, is_valid_datetime_format, \
is_object_value, is_ip, is_url, is_missing, is_gender, is_list_of_int, is_list_of_str, \
is_str, is_phone_number, is_int_like

Expand Down Expand Up @@ -876,8 +876,8 @@ def infer_data_types(self, value, cols_data_types):
dtype = ProfilerDataTypes.ZIP_CODE.value
elif is_int_like(value):
dtype = ProfilerDataTypes.INT.value
elif is_decimal(value):
dtype = ProfilerDataTypes.DECIMAL.value
elif is_float(value):
dtype = ProfilerDataTypes.FLOAT.value
elif is_datetime(value):
dtype = ProfilerDataTypes.DATETIME.value
elif is_missing(value):
Expand Down
2 changes: 1 addition & 1 deletion optimus/engines/spark/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class Constants(BaseConstants):
STRING_TYPES = ["str"]
ARRAY_TYPES = ["array"]

DTYPES_TO_INFERRED = {"int": ["smallint", "tinyint", "bigint", "int"], "decimal": ["float", "double"],
DTYPES_TO_INFERRED = {"int": ["smallint", "tinyint", "bigint", "int"], "float": ["float", "double"],
"string": ["string"], "date": ["date", "timestamp"], "boolean": ["boolean"],
"binary": ["binary"],
"array": ["array"], "object": ["object"], "null": ["null"], "missing": ["missing"]}
2 changes: 1 addition & 1 deletion optimus/helpers/columns_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def hist_date():

if data_type is not None:
col_data_type = data_type[col_name]["data_type"]
if col_data_type == "int" or col_data_type == "decimal":
if col_data_type == "int" or col_data_type == "float":
exprs = hist_numeric(min_max, buckets)
elif col_data_type == "string":
exprs = hist_string(buckets)
Expand Down
6 changes: 3 additions & 3 deletions optimus/helpers/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ class ProfilerDataTypesQuality(Enum):

class ProfilerDataTypes(Enum):
INT = "int"
DECIMAL = "decimal"
FLOAT = "float"
STRING = "str"
BOOL = "bool"
BOOLEAN = "boolean"
Expand Down Expand Up @@ -186,7 +186,7 @@ def list():
return list(map(lambda c: c.value, Schemas))


PROFILER_NUMERIC_DTYPES = [ProfilerDataTypes.INT.value, ProfilerDataTypes.DECIMAL.value]
PROFILER_NUMERIC_DTYPES = [ProfilerDataTypes.INT.value, ProfilerDataTypes.FLOAT.value]
PROFILER_STRING_DTYPES = [ProfilerDataTypes.STRING.value, ProfilerDataTypes.BOOLEAN.value,
ProfilerDataTypes.DATETIME.value, ProfilerDataTypes.ARRAY.value,
ProfilerDataTypes.OBJECT.value, ProfilerDataTypes.GENDER.value,
Expand Down Expand Up @@ -405,7 +405,7 @@ def print_check_point_config(filesystem):
PYTHON_TYPES = {"string": str, "int": int, "float": float, "boolean": bool}
PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "null", "array", "binary"}
PYTHON_TO_PROFILER = {"string": "categorical", "boolean": "categorical", "int": "numeric", "float": "numeric",
"decimal": "numeric", "date": "date", "array": "array", "binary": "binary", "null": "null"}
"float": "numeric", "date": "date", "array": "array", "binary": "binary", "null": "null"}

PROFILER_CATEGORICAL_DTYPES = [ProfilerDataTypes.BOOL.value,
ProfilerDataTypes.BOOLEAN.value, ProfilerDataTypes.ZIP_CODE.value,
Expand Down
2 changes: 1 addition & 1 deletion optimus/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ def is_dask_future(value):
return isinstance(value, distributed.client.Future)


def is_decimal(value):
def is_float(value):
return fastnumbers.isfloat(value, allow_nan=True)


Expand Down
2 changes: 1 addition & 1 deletion optimus/infer_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
"datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
}
SPARK_DTYPES_TO_INFERRED = {"int": ["smallint", "tinyint", "bigint", "int"], "decimal": ["float", "double"],
SPARK_DTYPES_TO_INFERRED = {"int": ["smallint", "tinyint", "bigint", "int"], "float": ["float", "double"],
"string": "string", "date": {"date", "timestamp"}, "boolean": "boolean", "binary": "binary",
"array": "array", "object": "object", "null": "null", "missing": "missing"}
PYSPARK_NUMERIC_TYPES = ["byte", "short", "big", "int", "double", "float"]
Expand Down
2 changes: 1 addition & 1 deletion optimus/profiler/templates/one_column.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<div class="main">
<div class="info">

{% if data.column_type=="categorical" or data.column_type=="decimal" or data.column_type=="numeric" or data.column_type=="date" or
{% if data.column_type=="categorical" or data.column_type=="float" or data.column_type=="numeric" or data.column_type=="date" or
data.column_type=="bool" or data.column_type=="array" or data.column_type=="null" or data.column_type=="timestamp"%}

<div class="panel_profiler">
Expand Down
2 changes: 1 addition & 1 deletion optimus/profiler/templates/out/one_column.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<div class="main" style="width: 100%; overflow: auto; border-bottom: 1px solid #eeeeee; padding: 10px 0;">
<div class="info" style="overflow: auto;">

{% if data.column_type=="categorical" or data.column_type=="decimal" or data.column_type=="numeric" or data.column_type=="date" or
{% if data.column_type=="categorical" or data.column_type=="float" or data.column_type=="numeric" or data.column_type=="date" or
data.column_type=="bool" or data.column_type=="array" or data.column_type=="null" or data.column_type=="timestamp"%}

<div class="panel_profiler" style="margin-right: 2%; float: left; padding-bottom: 2%; font-family: monospace;">
Expand Down