Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
7490ee4
Add shared forecast pipeline utilities and tests
SamuelBrand1 Dec 11, 2025
1641e59
change to relative imports
SamuelBrand1 Dec 11, 2025
2e434ea
add Parquet dep
SamuelBrand1 Dec 11, 2025
fca72b0
reduce docstring bloat
SamuelBrand1 Dec 11, 2025
f89673e
Add PipelineOutput support for pipeline forecasts
SamuelBrand1 Dec 11, 2025
0cbc8fd
Add DEFAULT_TARGET_LETTER and update output filenames
SamuelBrand1 Dec 11, 2025
5156147
move utils and rename paths dataclass
SamuelBrand1 Dec 12, 2025
4401bef
Add use_percentage flag to EpiAutoGPInput and output logic
SamuelBrand1 Dec 12, 2025
44fa0af
Refactor EpiAutoGP pipeline and add end-to-end tests
SamuelBrand1 Dec 12, 2025
966f4d6
Update .gitignore
SamuelBrand1 Dec 12, 2025
02591bb
Refactor EpiAutoGP post-processing into utility function
SamuelBrand1 Dec 12, 2025
9b46362
Refactor forecast utils to use context methods
SamuelBrand1 Dec 12, 2025
ada1b74
Update README.md
SamuelBrand1 Dec 12, 2025
6cef44d
Add frequency to input and generalize forecast horizon
SamuelBrand1 Dec 12, 2025
74cbe8d
Add ed_visit_type to input and output handling
SamuelBrand1 Dec 12, 2025
b693b8f
Add ed_visit_type param for NSSP/ED visit modeling
SamuelBrand1 Dec 12, 2025
2d258cb
Add daily NSSP forecast tests and support for ED visit type
SamuelBrand1 Dec 12, 2025
6a4426e
Refactor forecast utils tests and remove prep_epiautogp tests
SamuelBrand1 Dec 12, 2025
ea73cd8
update epiautogp docstrings
SamuelBrand1 Dec 15, 2025
511cf26
Update prep_epiautogp_data.py
SamuelBrand1 Dec 15, 2025
0dd0f9f
Update output.jl
SamuelBrand1 Dec 15, 2025
ae9313e
add nhsn test coverage
SamuelBrand1 Dec 15, 2025
e16f115
reorg unit tests
SamuelBrand1 Dec 15, 2025
65e5d2d
Update pipelines/epiautogp/process_epiautogp_forecast.py
SamuelBrand1 Dec 15, 2025
551543a
caught anti-pattern
SamuelBrand1 Dec 15, 2025
6bb924c
Merge branch '780-add-forecast_epiautogp-function' of https://github.…
SamuelBrand1 Dec 15, 2025
bc5dbce
explain use of percentage
SamuelBrand1 Dec 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactor EpiAutoGP pipeline and add end-to-end tests
Renamed forecast_utils.py to epiautogp_forecast_utils.py and updated all imports accordingly. Refactored the EpiAutoGP pipeline to use a context object for configuration, streamlined argument passing, and improved modularity. Added a new R plotting script (plot_epiautogp_forecast.R) for EpiAutoGP outputs. Introduced end-to-end and fit test shell scripts for automated testing. Removed obsolete prep test scripts. Updated process_epiautogp_forecast.py to simplify output processing and match R plotting expectations.
  • Loading branch information
SamuelBrand1 committed Dec 12, 2025
commit 44fa0af69ff39e2b2b2f8ff5ba4d6c6fab30e23e
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ class ModelPaths:
@dataclass
class ForecastPipelineContext:
"""
Container for common forecast pipeline data and configurations.
Container for common forecast pipeline data, input configurations and
the logger.

This class holds all the shared state that gets passed around during
a forecast pipeline run, reducing the number of parameters that need
Expand All @@ -55,6 +56,12 @@ class ForecastPipelineContext:

disease: str
loc: str
target: str
frequency: str
use_percentage: bool
model_name: str
eval_data_path: Path | None
nhsn_data_path: Path | None
report_date: date
first_training_date: date
last_training_date: date
Expand All @@ -72,6 +79,12 @@ def setup_forecast_pipeline(
disease: str,
report_date: str,
loc: str,
target: str,
frequency: str,
use_percentage: bool,
model_name: str,
eval_data_path: Path | None,
nhsn_data_path: Path | None,
facility_level_nssp_data_dir: Path | str,
state_level_nssp_data_dir: Path | str,
output_dir: Path | str,
Expand Down Expand Up @@ -181,6 +194,12 @@ def setup_forecast_pipeline(
return ForecastPipelineContext(
disease=disease,
loc=loc,
target=target,
frequency=frequency,
use_percentage=use_percentage,
model_name=model_name,
eval_data_path=eval_data_path,
nhsn_data_path=nhsn_data_path,
report_date=report_date_parsed,
first_training_date=first_training_date,
last_training_date=last_training_date,
Expand All @@ -197,10 +216,6 @@ def setup_forecast_pipeline(

def prepare_model_data(
context: ForecastPipelineContext,
model_name: str,
eval_data_path: Path = None,
nhsn_data_path: Path = None,
loc_level_nwss_data: pl.DataFrame = None,
) -> ModelPaths:
"""
Prepare training and evaluation data for a model.
Expand Down Expand Up @@ -238,7 +253,7 @@ def prepare_model_data(
logger = context.logger

# Create model output directory
model_output_dir = Path(context.model_run_dir, model_name)
model_output_dir = Path(context.model_run_dir, context.model_name)
data_dir = Path(model_output_dir, "data")
os.makedirs(data_dir, exist_ok=True)

Expand All @@ -250,31 +265,30 @@ def prepare_model_data(
disease=context.disease,
facility_level_nssp_data=context.facility_level_nssp_data,
loc_level_nssp_data=context.loc_level_nssp_data,
loc_level_nwss_data=loc_level_nwss_data,
report_date=context.report_date,
first_training_date=context.first_training_date,
last_training_date=context.last_training_date,
save_dir=data_dir,
logger=logger,
credentials_dict=context.credentials_dict,
nhsn_data_path=nhsn_data_path,
nhsn_data_path=context.nhsn_data_path,
)

# Save evaluation data
logger.info("Getting eval data...")
if eval_data_path is None:
if context.eval_data_path is None:
raise ValueError("No path to an evaluation dataset provided.")

save_eval_data(
loc=context.loc,
disease=context.disease,
first_training_date=context.first_training_date,
last_training_date=context.last_training_date,
latest_comprehensive_path=eval_data_path,
latest_comprehensive_path=context.eval_data_path,
output_data_dir=data_dir,
last_eval_date=context.report_date + timedelta(days=context.n_forecast_days),
credentials_dict=context.credentials_dict,
nhsn_data_path=nhsn_data_path,
nhsn_data_path=context.nhsn_data_path,
)
logger.info("Done getting eval data.")

Expand Down
153 changes: 66 additions & 87 deletions pipelines/epiautogp/forecast_epiautogp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,55 +10,35 @@
run_r_script,
)
from pipelines.epiautogp import convert_to_epiautogp_json
from pipelines.epiautogp.process_epiautogp_forecast import process_epiautogp_forecast
from pipelines.forecast_utils import (
from pipelines.epiautogp.epiautogp_forecast_utils import (
prepare_model_data,
setup_forecast_pipeline,
)
from pipelines.epiautogp.process_epiautogp_forecast import process_epiautogp_forecast


def run_epiautogp_forecast(
json_input_path: Path,
model_dir: Path,
target: str,
n_forecast_weeks: int = 8,
n_particles: int = 24,
n_mcmc: int = 100,
n_hmc: int = 50,
n_forecast_draws: int = 2000,
transformation: str = "boxcox",
smc_data_proportion: float = 0.1,
params: dict,
execution_settings: dict,
) -> None:
"""
Run EpiAutoGP forecasting model using Julia.

Parameters
----------
json_input_path : Path
Path to the JSON input file for EpiAutoGP.
model_dir : Path
Directory containing the model data and where outputs will be saved
target : str
Target data type: "nssp" for ED visit data or "nhsn" for hospital admissions
n_forecast_weeks : int, default=8
Number of weeks to forecast
n_particles : int, default=24
Number of particles for SMC
n_mcmc : int, default=100
Number of MCMC steps for GP kernel structure
n_hmc : int, default=50
Number of HMC steps for GP kernel hyperparameters
n_forecast_draws : int, default=2000
Number of forecast draws
transformation : str, default="boxcox"
Data transformation type
smc_data_proportion : float, default=0.1
Proportion of data used in each SMC step
Directory to save model outputs.
params : dict
Parameters to pass to EpiAutoGP.
execution_settings : dict
Execution settings for the Julia environment.
"""
# Use model_dir directly (not a subdirectory) to match R pipeline expectations
# The R plotting code expects parquet files at model_dir/filename.parquet
output_dir = Path(model_dir)

# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)
model_dir.mkdir(parents=True, exist_ok=True)

# Instantiate julia environment for EpiAutoGP
run_julia_code(
Expand All @@ -70,21 +50,21 @@ def run_epiautogp_forecast(
function_name="setup_epiautogp_environment",
)

# Add path arguments to pass to EpiAutoGP
params["json-input"] = str(json_input_path)
params["output-dir"] = str(model_dir)

# Convert Python dict keys (with underscores) to Julia CLI args (with hyphens)
args_to_epiautogp = [
f"--{key.replace('_', '-')}={value}" for key, value in params.items()
]
executor_flags = [f"--{key}={value}" for key, value in execution_settings.items()]

# Run Julia script
run_julia_script(
"EpiAutoGP/run.jl",
[
f"--json-input={json_input_path}",
f"--output-dir={output_dir}",
f"--n-forecast-weeks={n_forecast_weeks}",
f"--n-particles={n_particles}",
f"--n-mcmc={n_mcmc}",
f"--n-hmc={n_hmc}",
f"--n-forecast-draws={n_forecast_draws}",
f"--transformation={transformation}",
f"--smc-data-proportion={smc_data_proportion}",
],
executor_flags=["--project=EpiAutoGP"],
args_to_epiautogp,
executor_flags=executor_flags,
function_name="run_epiautogp_forecast",
)
return None
Expand Down Expand Up @@ -112,17 +92,38 @@ def main(
n_mcmc: int = 100,
n_hmc: int = 50,
n_forecast_draws: int = 2000,
transformation: str = "boxcox",
smc_data_proportion: float = 0.1,
n_threads: int = 1,
) -> None:
# Step 0: Set up logging, model name and params to pass to EpiAutoGP
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Generate model name
model_name = f"epiautogp_{target}"
model_name = f"epiautogp_{target}_{frequency}"
if use_percentage:
model_name += "_pct"

# Declare transformation type
if use_percentage:
transformation = "percentage"
else:
transformation = "boxcox"

# EpiAutoGP params and execution settings
params = {
"n_particles": n_particles,
"n_mcmc": n_mcmc,
"n_hmc": n_hmc,
"n_forecast_draws": n_forecast_draws,
"transformation": transformation,
"smc_data_proportion": smc_data_proportion,
}
execution_settings = {
"project": "EpiAutoGP",
"threads": n_threads,
}

logger.info(
"Starting single-location EpiAutoGP forecasting pipeline for "
f"location {loc}, and report date {report_date}"
Expand All @@ -134,6 +135,12 @@ def main(
disease=disease,
report_date=report_date,
loc=loc,
target=target,
frequency=frequency,
use_percentage=use_percentage,
model_name=model_name,
eval_data_path=eval_data_path,
nhsn_data_path=nhsn_data_path,
facility_level_nssp_data_dir=facility_level_nssp_data_dir,
state_level_nssp_data_dir=state_level_nssp_data_dir,
output_dir=output_dir,
Expand All @@ -144,71 +151,50 @@ def main(
logger=logger,
)

# Step 2: Prepare data (process location data, eval data, epiweekly data)
# Step 2: Prepare data for modelling (process location data, eval data, epiweekly data)
# returns paths to prepared data files and directories
paths = prepare_model_data(
context=context,
model_name=model_name,
eval_data_path=eval_data_path,
nhsn_data_path=nhsn_data_path,
)

# Step 3: Convert data to EpiAutoGP JSON format
logger.info("Converting data to EpiAutoGP JSON format...")
epiautogp_json_path = Path(paths.data_dir, f"epiautogp_input_{target}.json")

epiautogp_json_path = convert_to_epiautogp_json(
daily_training_data_path=paths.daily_training_data,
epiweekly_training_data_path=paths.epiweekly_training_data,
output_json_path=epiautogp_json_path,
disease=disease,
location=loc,
forecast_date=context.report_date,
target=target,
frequency=frequency,
use_percentage=use_percentage,
logger=logger,
epiautogp_input_json_path = convert_to_epiautogp_json(
context=context,
paths=paths,
)

# Step 4: Run EpiAutoGP forecast
logger.info("Performing EpiAutoGP forecasting...")
run_epiautogp_forecast(
json_input_path=epiautogp_json_path,
json_input_path=epiautogp_input_json_path,
model_dir=paths.model_output_dir,
target=target,
n_forecast_weeks=n_forecast_weeks,
n_particles=n_particles,
n_mcmc=n_mcmc,
n_hmc=n_hmc,
n_forecast_draws=n_forecast_draws,
transformation=transformation,
smc_data_proportion=smc_data_proportion,
params=params,
execution_settings=execution_settings,
)

# Step 5: Process forecast outputs (combine with observed, calculate CIs)
# Step 5: Process forecast outputs (add metadata, calculate CIs)
logger.info("Processing forecast outputs...")
process_epiautogp_forecast(
model_run_dir=context.model_run_dir,
model_name=model_name,
model_name=context.model_name,
target=target,
n_forecast_days=context.n_forecast_days,
save=True,
)
logger.info("Forecast processing complete.")

# Step 6: Create hubverse table
logger.info("Creating hubverse table...")
create_hubverse_table(Path(context.model_run_dir, model_name))
create_hubverse_table(Path(context.model_run_dir, context.model_name))
logger.info("Postprocessing complete.")

# Step 7: Generate forecast plots
# Step 7: Generate forecast plots using EpiAutoGP-specific plotting script
logger.info("Generating forecast plots...")
plot_script = Path(__file__).parent / "plot_epiautogp_forecast.R"
run_r_script(
str(plot_script),
[context.model_run_dir, "--epiautogp-model-name", model_name],
capture_output=False,
text=True,
[str(context.model_run_dir), "--epiautogp-model-name", context.model_name],
function_name="plot_epiautogp_forecast",
)
logger.info("Plotting complete.")

Expand Down Expand Up @@ -292,13 +278,6 @@ def main(
help="Number of forecast draws (default: 2000).",
)

parser.add_argument(
"--transformation",
type=str,
default="boxcox",
help="Data transformation type (default: boxcox).",
)

parser.add_argument(
"--smc-data-proportion",
type=float,
Expand Down
Loading