Fix: Resolved duplicate datetime index values; weather_code is now kept and cast to int64.
Bahadir-Erdem committed Mar 3, 2025
commit cc6bd54196f6f211f90566fdabcda5773336ff7b
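For context, a minimal sketch of what this commit does, assuming pandas and a frame with a "datetime" column. The helper function is the one added in the diff; the sample data, frame names, and chained usage are illustrative only, not the notebook's full pipeline:

import pandas as pd

# Helper introduced in this commit: keep only the first row for each datetime value.
# drop_duplicates() on the column returns the surviving index labels, and .loc selects them.
def drop_duplicate_datetime_values(df: pd.DataFrame) -> pd.DataFrame:
    return df.loc[df["datetime"].drop_duplicates().index, :]

# Illustrative frame with a repeated timestamp and a float weather_code.
sample_df = pd.DataFrame({
    "datetime": pd.to_datetime(["2024-01-01", "2024-01-01", "2024-01-02"]),
    "weather_code": [53.0, 53.0, 51.0],
})

cleaned_df = (
    sample_df
    .pipe(drop_duplicate_datetime_values)
    .assign(weather_code=lambda d: d["weather_code"].astype("int64"))  # weather_code kept, cast to int64
    .set_index("datetime")
)
assert cleaned_df.index.is_unique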
week2/friday/you_do_4/notebooks/4-data-cleaning.ipynb (80 changes: 50 additions & 30 deletions)
@@ -15,10 +15,10 @@
"metadata": {},
"outputs": [],
"source": [
"dam_address = \"../dataset/raw/dam_occupancy.csv\"\n",
"weather_address = \"../dataset/external/weather.csv\"\n",
"flood_address = \"../dataset/external/flood.csv\"\n",
"climate_change_address = \"../dataset/external/climate_change.csv\""
"dam_address = \"dataset/raw/dam_occupancy.csv\"\n",
"weather_address = \"dataset/external/weather.csv\"\n",
"flood_address = \"dataset/external/flood.csv\"\n",
"climate_change_address = \"dataset/external/climate_change.csv\""
]
},
{
@@ -71,7 +71,7 @@
}
],
"conversionMethod": "pd.DataFrame",
"ref": "356e9564-80f2-4e10-ac9c-83f9b127b10e",
"ref": "37ab2f17-431f-44a0-9c46-ac0672d2d9e2",
"rows": [
[
"0",
@@ -545,11 +545,10 @@
"outputs": [],
"source": [
"dam_df = (\n",
" dam_df\n",
" .pipe(lower_column_names)\n",
" dam_df.pipe(lower_column_names)\n",
" .assign(**{\"datetime\": lambda df: pd.to_datetime(df[\"date\"])})\n",
" .drop(columns=\"date\")\n",
")"
")\n"
]
},
{
@@ -680,7 +679,7 @@
}
],
"conversionMethod": "pd.DataFrame",
"ref": "60c6dd70-34a1-4cde-aa9d-bc199750372c",
"ref": "82add7ac-063f-441e-9e0e-731fce1e19c3",
"rows": [
[
"0",
@@ -939,7 +938,8 @@
" .assign(**{\n",
" \"datetime\": lambda df: pd.to_datetime(\n",
" pd.to_datetime(df[\"date\"]).dt.tz_localize(None).dt.date\n",
" )\n",
" ),\n",
" \"weather_code\": lambda df: df[\"weather_code\"].astype(\"int64\")\n",
" })\n",
" .drop(columns=\"date\")\n",
")"
@@ -961,8 +961,8 @@
},
{
"name": "weather_code",
"rawType": "float64",
"type": "float"
"rawType": "int64",
"type": "integer"
},
{
"name": "temperature_2m_max",
@@ -1066,11 +1066,11 @@
}
],
"conversionMethod": "pd.DataFrame",
"ref": "93ca9152-41f4-4ecb-aec7-f7514aa16d62",
"ref": "d0604ccf-2cb9-433d-a58b-b9e13917ef30",
"rows": [
[
"0",
"53.0",
"53",
"10.563",
"7.363",
"8.990084",
@@ -1094,7 +1094,7 @@
],
[
"1",
"51.0",
"51",
"7.813",
"4.713",
"6.660917",
@@ -1118,7 +1118,7 @@
],
[
"2",
"51.0",
"51",
"8.613",
"2.263",
"6.0317497",
@@ -1191,7 +1191,7 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>53.0</td>\n",
" <td>53</td>\n",
" <td>10.563</td>\n",
" <td>7.363</td>\n",
" <td>8.990084</td>\n",
@@ -1215,7 +1215,7 @@
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>51.0</td>\n",
" <td>51</td>\n",
" <td>7.813</td>\n",
" <td>4.713</td>\n",
" <td>6.660917</td>\n",
@@ -1239,7 +1239,7 @@
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>51.0</td>\n",
" <td>51</td>\n",
" <td>8.613</td>\n",
" <td>2.263</td>\n",
" <td>6.031750</td>\n",
@@ -1268,9 +1268,9 @@
],
"text/plain": [
" weather_code temperature_2m_max temperature_2m_min temperature_2m_mean \\\n",
"0 53.0 10.563 7.363 8.990084 \n",
"1 51.0 7.813 4.713 6.660917 \n",
"2 51.0 8.613 2.263 6.031750 \n",
"0 53 10.563 7.363 8.990084 \n",
"1 51 7.813 4.713 6.660917 \n",
"2 51 8.613 2.263 6.031750 \n",
"\n",
" apparent_temperature_max apparent_temperature_min \\\n",
"0 9.071384 4.146955 \n",
@@ -1323,7 +1323,7 @@
"Data columns (total 21 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 weather_code 5941 non-null float64 \n",
" 0 weather_code 5941 non-null int64 \n",
" 1 temperature_2m_max 5941 non-null float64 \n",
" 2 temperature_2m_min 5941 non-null float64 \n",
" 3 temperature_2m_mean 5941 non-null float64 \n",
@@ -1344,7 +1344,7 @@
" 18 shortwave_radiation_sum 5941 non-null float64 \n",
" 19 et0_fao_evapotranspiration 5941 non-null float64 \n",
" 20 datetime 5941 non-null datetime64[ns]\n",
"dtypes: datetime64[ns](1), float64(18), int64(2)\n",
"dtypes: datetime64[ns](1), float64(17), int64(3)\n",
"memory usage: 974.8 KB\n"
]
}
@@ -1386,7 +1386,7 @@
}
],
"conversionMethod": "pd.DataFrame",
"ref": "4a3af9fd-98ed-41de-8b38-4d0c75b937c5",
"ref": "d71d8bec-a9d8-4cb3-9c09-c53fe6845598",
"rows": [
[
"0",
@@ -1633,7 +1633,7 @@
}
],
"conversionMethod": "pd.DataFrame",
"ref": "5e06724a-289b-47c8-b9ea-5a730a82a75c",
"ref": "a9bb947c-e9ba-4433-8689-0ef465d1c5b0",
"rows": [
[
"0",
@@ -2114,6 +2114,16 @@
"cli_change_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def drop_duplicate_datetime_values(df: pd.DataFrame) -> pd.DataFrame:\n",
" return df.loc[df[\"datetime\"].drop_duplicates().index, :]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -2123,7 +2133,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -2134,20 +2144,30 @@
" .merge(cli_change_df, how=\"left\", on=\"datetime\")\n",
" .dropna()\n",
" .drop_duplicates()\n",
" .pipe(drop_duplicate_datetime_values)\n",
" .drop(columns=[\"sunrise\", \"sunset\"]) # constant value columns\n",
" .drop(columns=\"snowfall_sum\") # 96% of values are 0\n",
" .drop(columns=\"general_dam_reserved_water\") # just another target column\n",
" .drop(columns=\"weather_code\") # Might be used for target encoding later, currently not included.\n",
" .assign(weather_code=lambda df: df[\"weather_code\"].astype(\"int64\"))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"final_df = final_df.set_index(\"datetime\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"final_df.to_csv(\"../dataset/interim/past_dataset.csv\", index=False)"
"final_df.to_csv(\"dataset/interim/past_dataset.csv\")"
]
}
],
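Since set_index("datetime") now runs before the export and index=False was dropped from to_csv, the datetime index is written to the file. A quick round-trip check of the commit's intent might look like the following sketch; the path is taken from the diff, but the working directory and the check itself are assumptions, not part of the notebook:

import pandas as pd

# Read the exported file back, parsing the datetime column as the index.
check_df = pd.read_csv(
    "dataset/interim/past_dataset.csv",
    index_col="datetime",
    parse_dates=["datetime"],
)

# The point of the fix: no duplicate timestamps remain in the index.
assert check_df.index.is_unique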