diff --git a/example_model.py b/example_model.py
index cbbf663..0d38868 100644
--- a/example_model.py
+++ b/example_model.py
@@ -1,13 +1,10 @@
-import time
-start = time.time()
-
 import pandas as pd
 from lightgbm import LGBMRegressor
 import gc
 import json
+from pathlib import Path

 from numerapi import NumerAPI
-from halo import Halo
 from utils import (
     save_model,
     load_model,
@@ -21,31 +18,42 @@
 )


+# download all the things
+
 napi = NumerAPI()
-spinner = Halo(text='', spinner='dots')

 current_round = napi.get_current_round()

 # Tournament data changes every week so we specify the round in their name. Training
 # and validation data only change periodically, so no need to download them every time.
 print('Downloading dataset files...')
-napi.download_dataset("numerai_training_data.parquet", "training_data.parquet")
-napi.download_dataset("numerai_tournament_data.parquet", f"tournament_data_{current_round}.parquet")
-napi.download_dataset("numerai_validation_data.parquet", f"validation_data.parquet")
-napi.download_dataset("example_validation_predictions.parquet")
-napi.download_dataset("features.json")
+
+Path("./v4").mkdir(parents=False, exist_ok=True)
+napi.download_dataset("v4/train.parquet")
+napi.download_dataset("v4/validation.parquet")
+napi.download_dataset("v4/live.parquet")
+napi.download_dataset("v4/validation_example_preds.parquet")
+napi.download_dataset("v4/features.json")

 print('Reading minimal training data')
-# read the feature metadata and get the "small" feature set
-with open("features.json", "r") as f:
+# read the feature metadata and get a feature set (or all the features)
+with open("v4/features.json", "r") as f:
     feature_metadata = json.load(f)
-features = feature_metadata["feature_sets"]["small"]
+# features = list(feature_metadata["feature_stats"].keys()) # get all the features
+# features = feature_metadata["feature_sets"]["small"] # get the small feature set
+features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
 # read in just those features along with era and target columns
 read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

 # note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
 # if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
-training_data = pd.read_parquet('training_data.parquet', columns=read_columns)
+training_data = pd.read_parquet('v4/train.parquet',
+                                columns=read_columns)
+validation_data = pd.read_parquet('v4/validation.parquet',
+                                  columns=read_columns)
+live_data = pd.read_parquet(f'v4/live.parquet',
+                            columns=read_columns)
+

 # pare down the number of eras to every 4th era
 # every_4th_era = training_data[ERA_COL].unique()[::4]
@@ -77,34 +85,27 @@
     model = LGBMRegressor(**params)

     # train on all of train and save the model so we don't have to train next time
-    spinner.start('Training model')
     model.fit(training_data.filter(like='feature_', axis='columns'),
               training_data[TARGET_COL])
     print(f"saving new model: {model_name}")
     save_model(model, model_name)
-    spinner.succeed()
 gc.collect()

-print('Reading minimal features of validation and tournament data...')
-validation_data = pd.read_parquet('validation_data.parquet',
-                                  columns=read_columns)
-tournament_data = pd.read_parquet(f'tournament_data_{current_round}.parquet',
-                                  columns=read_columns)
-nans_per_col = tournament_data[tournament_data["data_type"] == "live"].isna().sum()
+nans_per_col = live_data[live_data["data_type"] == "live"][features].isna().sum()

 # check for nans and fill nans
 if nans_per_col.any():
-    total_rows = len(tournament_data[tournament_data["data_type"] == "live"])
+    total_rows = len(live_data[live_data["data_type"] == "live"])
     print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
     print(f"out of {total_rows} total rows")
     print(f"filling nans with 0.5")
-    tournament_data.loc[:, features] = tournament_data.loc[:, features].fillna(0.5)
+    live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)
+
 else:
     print("No nans in the features this week!")

-spinner.start('Predicting on validation and tournament data')
 # double check the feature that the model expects vs what is available to prevent our
 # pipeline from failing if Numerai adds more data and we don't have time to retrain!
 model_expected_features = model.booster_.feature_name()
@@ -112,14 +113,11 @@
     print(f"New features are available! Might want to retrain model {model_name}.")
 validation_data.loc[:, f"preds_{model_name}"] = model.predict(
     validation_data.loc[:, model_expected_features])
-tournament_data.loc[:, f"preds_{model_name}"] = model.predict(
-    tournament_data.loc[:, model_expected_features])
-spinner.succeed()
+live_data.loc[:, f"preds_{model_name}"] = model.predict(
+    live_data.loc[:, model_expected_features])

 gc.collect()

-spinner.start('Neutralizing to risky features')
-
 # neutralize our predictions to the riskiest features
 validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
     df=validation_data,
@@ -130,33 +128,29 @@
     era_col=ERA_COL
 )

-tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
-    df=tournament_data,
+live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+    df=live_data,
     columns=[f"preds_{model_name}"],
     neutralizers=riskiest_features,
     proportion=1.0,
     normalize=True,
     era_col=ERA_COL
 )
-spinner.succeed()
-
 model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

 # rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
 validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
-tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
+live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
 validation_data["prediction"].to_csv(f"validation_predictions_{current_round}.csv")
-tournament_data["prediction"].to_csv(f"tournament_predictions_{current_round}.csv")
+live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")

-spinner.start('Reading example validation predictions')
-validation_preds = pd.read_parquet('example_validation_predictions.parquet')
+validation_preds = pd.read_parquet('v4/validation_example_preds.parquet')
 validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
-spinner.succeed()

 # get some stats about each of our models to compare...
 # fast_mode=True so that we skip some of the stats that are slower to calculate
-validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
+validation_stats = validation_metrics(validation_data, [model_to_submit, f"preds_{model_name}"], example_col=EXAMPLE_PREDS_COL, fast_mode=True, target_col=TARGET_COL)
 print(validation_stats[["mean", "sharpe"]].to_markdown())

 print(f'''
@@ -165,5 +159,3 @@
 2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
 3. Submit tournament_predictions_{current_round}.csv to the "Upload Predictions" button
 ''')
-
-print(f'done in {(time.time() - start) / 60} mins')
diff --git a/example_model_advanced.py b/example_model_advanced.py
index e8fc9d8..428e5e6 100644
--- a/example_model_advanced.py
+++ b/example_model_advanced.py
@@ -2,12 +2,22 @@
 from lightgbm import LGBMRegressor
 import gc
 from numerapi import NumerAPI
-from utils import save_prediction, save_model, load_model, neutralize, get_biggest_change_features, validation_metrics, download_data, \
-    load_model_config, save_model_config, get_time_series_cross_val_splits
+from pathlib import Path
+from utils import (
+    save_model,
+    load_model,
+    neutralize,
+    get_biggest_change_features,
+    get_time_series_cross_val_splits,
+    validation_metrics,
+    load_model_config,
+    save_model_config,
+    save_prediction,
+    TARGET_COL,
+)

 EXAMPLE_PREDS_COL = "example_preds"
-TARGET_COL = "target"
 ERA_COL = "era"

 # params we'll use to train all of our models.
 # Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
@@ -21,7 +31,7 @@
 # a value of 1 means no downsampling
 # a value of 10 means use every 10th row
 downsample_cross_val = 20
-downsample_full_train = 1
+downsample_full_train = 2

 # if model_selection_loop=True get OOS performance for training_data
 # and use that to select best model
@@ -33,14 +43,16 @@
 current_round = napi.get_current_round()

+Path("./v4").mkdir(parents=False, exist_ok=True)
+napi.download_dataset("v4/train.parquet")
+napi.download_dataset("v4/features.json")
+
+
 print("Entering model selection loop. This may take awhile.")
 if model_selection_loop:
     model_config = {}
-    print('downloading training_data')
-    napi.download_dataset("numerai_training_data.parquet")
-
-    print("reading training data from local file")
-    training_data = pd.read_parquet('numerai_training_data.parquet')
+    print('reading training_data')
+    training_data = pd.read_parquet('v4/train.parquet')

     # keep track of some prediction columns
     ensemble_cols = set()
@@ -50,7 +62,7 @@
     possible_targets = [c for c in training_data.columns if c.startswith("target_")]
     # randomly pick a handful of targets
     # this can be vastly improved
-    targets = ["target", "target_nomi_60", "target_jerome_20"]
+    targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]

     # all the possible features to train on
     feature_cols = [c for c in training_data if c.startswith("feature_")]
@@ -136,7 +148,7 @@
     # use example_col preds_model_target as an estimates since no example preds provided for training
     # fast_mode=True so that we skip some of the stats that are slower to calculate
     training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
-                                        fast_mode=True)
+                                        fast_mode=True, target_col=TARGET_COL)
     print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())

     # pick the model that has the highest correlation sharpe
@@ -183,35 +195,28 @@
 """ Things that we always do even if we've already trained """
 gc.collect()

-print("downloading tournament_data")
-napi.download_dataset("numerai_tournament_data.parquet", f"numerai_tournament_data_{current_round}.parquet")
-print("downloading validation_data")
-napi.download_dataset("numerai_validation_data.parquet")
-print("downloading example_predictions")
-napi.download_dataset('example_predictions.parquet', f'example_predictions_{current_round}.parquet')
-print("downloading example_validation_predictions")
-napi.download_dataset('example_validation_predictions.parquet')
 print("reading tournament_data")
-tournament_data = pd.read_parquet(f'numerai_tournament_data_{current_round}.parquet')
+live_data = pd.read_parquet('v4/live.parquet')
 print("reading validation_data")
-validation_data = pd.read_parquet('numerai_validation_data.parquet')
+validation_data = pd.read_parquet('v4/validation.parquet')
 print("reading example_predictions")
-example_preds = pd.read_parquet(f'example_predictions_{current_round}.parquet')
+example_preds = pd.read_parquet('v4/live_example_preds.parquet')
 print("reading example_validaton_predictions")
-validation_example_preds = pd.read_parquet('example_validation_predictions.parquet')
+validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')

 # set the example predictions
 validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]

 # check for nans and fill nans
 print("checking for nans in the tournament data")
-if tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum().sum():
-    cols_w_nan = tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum()
-    total_rows = tournament_data[tournament_data["data_type"] == "live"]
+if live_data.loc[:, feature_cols].isna().sum().sum():
+    cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
+    total_rows = len(live_data)
     print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
     print(f"out of {total_rows} total rows")
     print(f"filling nans with 0.5")
-    tournament_data.loc[:, feature_cols] = tournament_data.loc[:, feature_cols].fillna(0.5)
+    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
+
 else:
     print("No nans in the features this week!")

@@ -231,7 +236,7 @@
         print(f"New features are available! Might want to retrain model {model_name}.")
     print(f"predicting tournament and validation for {model_name}")
     validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
-    tournament_data.loc[:, f"preds_{model_name}"] = model.predict(tournament_data.loc[:, model_expected_features])
+    live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])

     # do different neutralizations
     # neutralize our predictions to the riskiest features only
@@ -242,7 +247,7 @@
                                                                             proportion=1.0,
                                                                             normalize=True,
                                                                             era_col=ERA_COL)[f"preds_{model_name}"]
-    tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=tournament_data,
+    live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=live_data,
                                                                             columns=[f"preds_{model_name}"],
                                                                             neutralizers=riskiest_features,
                                                                             proportion=1.0,
@@ -255,36 +260,37 @@
 # rank per era for each prediction column so that we can combine safely
 validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
-tournament_data[list(pred_cols)] = tournament_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
+live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))

 # make ensembles for val and tournament
 print('creating ensembles for tournament and validation')
 validation_data["ensemble_neutral_riskiest_50"] = sum(
     [validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
     pct=True)
-tournament_data["ensemble_neutral_riskiest_50"] = sum(
-    [tournament_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
+live_data["ensemble_neutral_riskiest_50"] = sum(
+    [live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
     pct=True)
 ensemble_cols.add("ensemble_neutral_riskiest_50")

 validation_data["ensemble_not_neutral"] = sum(
     [validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
-tournament_data["ensemble_not_neutral"] = sum(
-    [tournament_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
+live_data["ensemble_not_neutral"] = sum(
+    [live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
 ensemble_cols.add("ensemble_not_neutral")

 validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
-tournament_data["ensemble_all"] = sum([tournament_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
+live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
+
 ensemble_cols.add("ensemble_all")

 gc.collect()
 print("getting final validation stats")
 # get our final validation stats for our chosen model
-validation_stats = validation_metrics(validation_data, [best_pred_col], example_col=EXAMPLE_PREDS_COL,
-                                      fast_mode=False)
+validation_stats = validation_metrics(validation_data, list(pred_cols)+list(ensemble_cols), example_col=EXAMPLE_PREDS_COL,
+                                      fast_mode=False, target_col=TARGET_COL)
 print(validation_stats.to_markdown())

 # rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
 validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
-tournament_data["prediction"] = tournament_data[best_pred_col].rank(pct=True)
+live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
 save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
-save_prediction(tournament_data["prediction"], f"tournament_predictions_{current_round}")
+save_prediction(live_data["prediction"], f"live_data_{current_round}")
diff --git a/utils.py b/utils.py
index 6133d73..ccea7e0 100644
--- a/utils.py
+++ b/utils.py
@@ -1,15 +1,13 @@
-import os
-import requests
 import numpy as np
 import pandas as pd
 import scipy
 from halo import Halo
 from pathlib import Path
 import json
-from scipy.stats import skew, kurtosis
+from scipy.stats import skew


 ERA_COL = "era"
-TARGET_COL = "target_nomi_20"
+TARGET_COL = "target_nomi_v4_20"
 DATA_TYPE_COL = "data_type"
 EXAMPLE_PREDS_COL = "example_preds"
@@ -19,6 +17,7 @@
 MODEL_CONFIGS_FOLDER = "model_configs"
 PREDICTION_FILES_FOLDER = "prediction_files"

+
 def save_prediction(df, name):
     try:
         Path(PREDICTION_FILES_FOLDER).mkdir(exist_ok=True, parents=True)
@@ -26,6 +25,7 @@ def save_prediction(df, name):
         pass
     df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True)

+
 def save_model(model, name):
     try:
         Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True)
@@ -76,7 +76,7 @@ def get_biggest_change_features(corrs, n):
     return worst_n


-def get_time_series_cross_val_splits(data, cv = 3, embargo = 12):
+def get_time_series_cross_val_splits(data, cv=3, embargo=12):
     all_train_eras = data[ERA_COL].unique()
     len_split = len(all_train_eras) // cv
     test_splits = [all_train_eras[i * len_split:(i + 1) * len_split] for i in range(cv)]
@@ -126,7 +126,7 @@ def neutralize(df,
             exposures = df_era[neutralizers].values

         scores -= proportion * exposures.dot(
-            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))
+            np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot(scores.astype(np.float32)))

         scores /= scores.std(ddof=0)

@@ -158,12 +158,12 @@ def unif(df):
     return pd.Series(x, index=df.index)


-def get_feature_neutral_mean(df, prediction_col):
+def get_feature_neutral_mean(df, prediction_col, target_col):
     feature_cols = [c for c in df.columns if c.startswith("feature")]
     df.loc[:, "neutral_sub"] = neutralize(df, [prediction_col],
                                           feature_cols)[prediction_col]
     scores = df.groupby("era").apply(
-        lambda x: (unif(x["neutral_sub"]).corr(x[TARGET_COL]))).mean()
+        lambda x: (unif(x["neutral_sub"]).corr(x[target_col]))).mean()
     return np.mean(scores)


@@ -188,13 +188,13 @@ def fast_score_by_date(df, columns, target, tb=None, era_col="era"):
     return pd.DataFrame(np.array(computed), columns=columns, index=df[era_col].unique())


-def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False):
+def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False, target_col=TARGET_COL):
     validation_stats = pd.DataFrame()
     feature_cols = [c for c in validation_data if c.startswith("feature_")]
     for pred_col in pred_cols:
         # Check the per-era correlations on the validation set (out of sample)
         validation_correlations = validation_data.groupby(ERA_COL).apply(
-            lambda d: unif(d[pred_col]).corr(d[TARGET_COL]))
+            lambda d: unif(d[pred_col]).corr(d[target_col]))

         mean = validation_correlations.mean()
         std = validation_correlations.std(ddof=0)
@@ -214,13 +214,13 @@ def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False)
         payout_daily_value = (payout_scores + 1).cumprod()

         apy = (
-            (
-                (payout_daily_value.dropna().iloc[-1])
-                ** (1 / len(payout_scores))
-            )
-            ** 49  # 52 weeks of compounding minus 3 for stake compounding lag
-            - 1
-        ) * 100
+                      (
+                              (payout_daily_value.dropna().iloc[-1])
+                              ** (1 / len(payout_scores))
+                      )
+                      ** 49  # 52 weeks of compounding minus 3 for stake compounding lag
+                      - 1
+              ) * 100

         validation_stats.loc["apy", pred_col] = apy

@@ -232,14 +232,14 @@ def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False)
         validation_stats.loc["max_feature_exposure", pred_col] = max_feature_exposure

         # Check feature neutral mean
-        feature_neutral_mean = get_feature_neutral_mean(validation_data, pred_col)
+        feature_neutral_mean = get_feature_neutral_mean(validation_data, pred_col, target_col)
         validation_stats.loc["feature_neutral_mean", pred_col] = feature_neutral_mean

         # Check top and bottom 200 metrics (TB200)
         tb200_validation_correlations = fast_score_by_date(
             validation_data,
             [pred_col],
-            TARGET_COL,
+            target_col,
             tb=200,
             era_col=ERA_COL
         )
@@ -257,8 +257,8 @@ def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False)
         corr_scores = []
         for _, x in validation_data.groupby(ERA_COL):
             series = neutralize_series(unif(x[pred_col]), (x[example_col]))
-            mmc_scores.append(np.cov(series, x[TARGET_COL])[0, 1] / (0.29 ** 2))
-            corr_scores.append(unif(x[pred_col]).corr(x[TARGET_COL]))
+            mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29 ** 2))
+            corr_scores.append(unif(x[pred_col]).corr(x[target_col]))

         val_mmc_mean = np.mean(mmc_scores)
         val_mmc_std = np.std(mmc_scores)