diff --git a/petl/test/transform/test_normalize_timezone.py b/petl/test/transform/test_normalize_timezone.py
new file mode 100644
index 00000000..7bb4566a
--- /dev/null
+++ b/petl/test/transform/test_normalize_timezone.py
@@ -0,0 +1,27 @@
+import unittest
+from petl.transform.normalize_timezone import normalize_timezone
+
+class TestNormalizeTimezone(unittest.TestCase):
+
+    def test_basic_conversion(self):
+        input_data = [
+            {'timestamp': '2023-12-01T10:00:00', 'timezone': 'America/New_York'},
+            {'timestamp': '2023-12-01T15:00:00', 'timezone': 'Europe/London'}
+        ]
+        result = list(normalize_timezone(input_data))
+        self.assertEqual(result[0]['timestamp_utc'], '2023-12-01T15:00:00+00:00')
+        self.assertEqual(result[1]['timestamp_utc'], '2023-12-01T15:00:00+00:00')
+        self.assertEqual(result[0]['timezone_original'], 'America/New_York')
+
+    def test_invalid_timezone(self):
+        input_data = [{'timestamp': '2023-12-01T10:00:00', 'timezone': 'Invalid/Zone'}]
+        with self.assertRaises(ValueError):
+            list(normalize_timezone(input_data))
+
+    def test_missing_timestamp(self):
+        input_data = [{'timezone': 'UTC'}]
+        with self.assertRaises(ValueError):
+            list(normalize_timezone(input_data))
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/petl/transform/normalize_timezone.py b/petl/transform/normalize_timezone.py
new file mode 100644
index 00000000..5bf173e8
--- /dev/null
+++ b/petl/transform/normalize_timezone.py
@@ -0,0 +1,51 @@
+from datetime import datetime
+import pytz
+
+
+def normalize_timezone(table, timestamp_col='timestamp', tz_col='timezone'):
+    """
+    Normalize timestamps to UTC while retaining original timezone.
+
+    Input rows are copied, never mutated.
+
+    Args:
+        table: iterable of dict-like rows keyed by column name
+        timestamp_col (str): column name with naive ISO-8601 timestamp
+            strings (parsed with ``datetime.fromisoformat``)
+        tz_col (str): column name with timezone name (e.g., 'America/New_York')
+
+    Yields:
+        Each row with two added fields: 'timestamp_utc' and 'timezone_original'
+
+    Raises:
+        ValueError: if a row cannot be normalized (missing column, bad
+            timestamp, unknown timezone); the original error is chained
+            as ``__cause__``.
+    """
+    for row in table:
+        # Guard only the conversion. The yield stays outside the try so an
+        # exception thrown into the generator by its consumer is not
+        # misreported as a normalization failure.
+        try:
+            original_ts = row[timestamp_col]
+            original_tz = row[tz_col]
+
+            # Parse the timestamp
+            naive_dt = datetime.fromisoformat(original_ts)
+
+            # Attach original timezone (localize() requires a naive datetime)
+            local_dt = pytz.timezone(original_tz).localize(naive_dt)
+
+            # Convert to UTC
+            utc_dt = local_dt.astimezone(pytz.UTC)
+
+            # Create a new row with original + new fields
+            new_row = dict(row)
+            new_row['timestamp_utc'] = utc_dt.isoformat()
+            new_row['timezone_original'] = original_tz
+        except Exception as e:
+            raise ValueError(
+                f"Failed to normalize row {row} due to error: {e}"
+            ) from e
+
+        yield new_row