84 changes: 38 additions & 46 deletions example_model.py
@@ -1,13 +1,10 @@
import time
start = time.time()

import pandas as pd
from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path

from numerapi import NumerAPI
from halo import Halo
from utils import (
save_model,
load_model,
@@ -16,44 +13,55 @@
validation_metrics,
ERA_COL,
DATA_TYPE_COL,
TARGET_COL,
TARGET_COL_V4,
EXAMPLE_PREDS_COL
)


# download all the things

napi = NumerAPI()
spinner = Halo(text='', spinner='dots')

current_round = napi.get_current_round()

# Tournament data changes every week so we specify the round in their name. Training
# and validation data only change periodically, so no need to download them every time.
print('Downloading dataset files...')
napi.download_dataset("numerai_training_data.parquet", "training_data.parquet")
napi.download_dataset("numerai_tournament_data.parquet", f"tournament_data_{current_round}.parquet")
napi.download_dataset("numerai_validation_data.parquet", f"validation_data.parquet")
napi.download_dataset("example_validation_predictions.parquet")
napi.download_dataset("features.json")

Path("./v4").mkdir(parents=False, exist_ok=True)
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/validation.parquet")
napi.download_dataset("v4/live.parquet")
napi.download_dataset("v4/validation_example_preds.parquet")
napi.download_dataset("v4/features.json")

print('Reading minimal training data')
# read the feature metadata and get the "small" feature set
with open("features.json", "r") as f:
# read the feature metadata and get a feature set (or all the features)
with open("v4/features.json", "r") as f:
feature_metadata = json.load(f)
features = feature_metadata["feature_sets"]["small"]
# features = list(feature_metadata["feature_stats"].keys()) # get all the features
# features = feature_metadata["feature_sets"]["small"] # get the small feature set
features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
# read in just those features along with era and target columns
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL_V4]
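Editor's note (not part of this PR): if you are unsure which feature set to pick, the v4 metadata can be inspected directly. A minimal sketch, assuming the `feature_sets` mapping referenced in the code above:

```python
# Hypothetical inspection snippet, not part of this PR: list the feature sets
# available in v4/features.json before choosing "small" vs "medium".
import json

with open("v4/features.json", "r") as f:
    meta = json.load(f)

# feature_sets maps a set name to its list of feature column names
print({name: len(cols) for name, cols in meta["feature_sets"].items()})
```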

# note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
# if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
training_data = pd.read_parquet('training_data.parquet', columns=read_columns)
training_data = pd.read_parquet('v4/train.parquet',
columns=read_columns)
validation_data = pd.read_parquet('v4/validation.parquet',
columns=read_columns)
live_data = pd.read_parquet(f'v4/live.parquet',
columns=read_columns)
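A sketch of the fix suggested in the note above about corrupted parquet downloads; the helper name is made up, and `Path.unlink(missing_ok=True)` needs Python 3.8+:

```python
# Hypothetical helper, not part of this PR: if a downloaded parquet raises an
# "invalid magic bytes" error, delete it and re-download a fresh copy.
from pathlib import Path

def redownload(napi, dataset, dest=None):
    dest = dest or dataset
    Path(dest).unlink(missing_ok=True)    # drop the (possibly corrupted) file
    napi.download_dataset(dataset, dest)  # fetch it again

# usage: redownload(napi, "v4/train.parquet")
```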


# pare down the number of eras to every 4th era
# every_4th_era = training_data[ERA_COL].unique()[::4]
# training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]

# getting the per era correlation of each feature vs the target
all_feature_corrs = training_data.groupby(ERA_COL).apply(
lambda era: era[features].corrwith(era[TARGET_COL])
lambda era: era[features].corrwith(era[TARGET_COL_V4])
)

# find the riskiest features by comparing their correlation vs
@@ -77,49 +85,39 @@
model = LGBMRegressor(**params)

# train on all of train and save the model so we don't have to train next time
spinner.start('Training model')
model.fit(training_data.filter(like='feature_', axis='columns'),
training_data[TARGET_COL])
training_data[TARGET_COL_V4])
print(f"saving new model: {model_name}")
save_model(model, model_name)
spinner.succeed()

gc.collect()

print('Reading minimal features of validation and tournament data...')
validation_data = pd.read_parquet('validation_data.parquet',
columns=read_columns)
tournament_data = pd.read_parquet(f'tournament_data_{current_round}.parquet',
columns=read_columns)
nans_per_col = tournament_data[tournament_data["data_type"] == "live"].isna().sum()
nans_per_col = live_data[live_data["data_type"] == "live"][features].isna().sum()

# check for nans and fill nans
if nans_per_col.any():
total_rows = len(tournament_data[tournament_data["data_type"] == "live"])
total_rows = len(live_data[live_data["data_type"] == "live"])
print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
print(f"out of {total_rows} total rows")
print(f"filling nans with 0.5")
tournament_data.loc[:, features] = tournament_data.loc[:, features].fillna(0.5)
live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)

else:
print("No nans in the features this week!")


spinner.start('Predicting on validation and tournament data')
# double check the feature that the model expects vs what is available to prevent our
# pipeline from failing if Numerai adds more data and we don't have time to retrain!
model_expected_features = model.booster_.feature_name()
if set(model_expected_features) != set(features):
print(f"New features are available! Might want to retrain model {model_name}.")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(
validation_data.loc[:, model_expected_features])
tournament_data.loc[:, f"preds_{model_name}"] = model.predict(
tournament_data.loc[:, model_expected_features])
spinner.succeed()
live_data.loc[:, f"preds_{model_name}"] = model.predict(
live_data.loc[:, model_expected_features])

gc.collect()

spinner.start('Neutralizing to risky features')

# neutralize our predictions to the riskiest features
validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
df=validation_data,
@@ -130,33 +128,29 @@
era_col=ERA_COL
)

tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
df=tournament_data,
live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
df=live_data,
columns=[f"preds_{model_name}"],
neutralizers=riskiest_features,
proportion=1.0,
normalize=True,
era_col=ERA_COL
)
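For context on what the `neutralize` calls above do, here is a rough sketch of per-era linear neutralization; this is an assumption about the technique, not necessarily the exact `utils.neutralize` implementation, and the `normalize=True` handling is simplified:

```python
# Rough sketch (assumption) of per-era feature neutralization: within each era,
# subtract the part of the predictions that is linearly explained by the
# neutralizer features, then rescale.
import numpy as np
import pandas as pd

def neutralize_sketch(df, column, neutralizers, proportion=1.0, era_col="era"):
    neutralized = []
    for _, era in df.groupby(era_col):
        preds = era[column].values.astype(np.float64)
        exposures = era[neutralizers].values.astype(np.float64)
        beta = np.linalg.pinv(exposures) @ preds          # linear fit within the era
        residual = preds - proportion * (exposures @ beta)
        neutralized.append(pd.Series(residual / residual.std(), index=era.index))
    return pd.concat(neutralized)
```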
spinner.succeed()


model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"validation_predictions_{current_round}.csv")
tournament_data["prediction"].to_csv(f"tournament_predictions_{current_round}.csv")
live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")

spinner.start('Reading example validation predictions')
validation_preds = pd.read_parquet('example_validation_predictions.parquet')
validation_preds = pd.read_parquet('v4/example_validation_preds.parquet')
validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
spinner.succeed()

# get some stats about each of our models to compare...
# fast_mode=True so that we skip some of the stats that are slower to calculate
validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
validation_stats = validation_metrics(validation_data, [model_to_submit, f"preds_{model_name}"], example_col=EXAMPLE_PREDS_COL, fast_mode=True, target_col=TARGET_COL_V4)
print(validation_stats[["mean", "sharpe"]].to_markdown())
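The `mean` and `sharpe` columns printed above are per-era correlation statistics. A simplified sketch of how such numbers are typically computed (an assumption: `utils.validation_metrics` covers more than this, and `fast_mode=True` only skips the slower stats):

```python
# Simplified sketch (assumption) of the headline validation metrics: the mean
# and Sharpe (mean / std) of the per-era correlation between ranked predictions
# and the target.
import numpy as np

def corr_mean_and_sharpe(df, pred_col, target_col, era_col="era"):
    per_era = df.groupby(era_col).apply(
        lambda d: np.corrcoef(d[pred_col].rank(pct=True), d[target_col])[0, 1]
    )
    return per_era.mean(), per_era.mean() / per_era.std()
```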

print(f'''
@@ -165,5 +159,3 @@
2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
3. Submit tournament_predictions_{current_round}.csv to the "Upload Predictions" button
''')

print(f'done in {(time.time() - start) / 60} mins')
88 changes: 47 additions & 41 deletions example_model_advanced.py
@@ -2,12 +2,22 @@
from lightgbm import LGBMRegressor
import gc
from numerapi import NumerAPI
from utils import save_prediction, save_model, load_model, neutralize, get_biggest_change_features, validation_metrics, download_data, \
load_model_config, save_model_config, get_time_series_cross_val_splits
from pathlib import Path
from utils import (
save_model,
load_model,
neutralize,
get_biggest_change_features,
get_time_series_cross_val_splits,
validation_metrics,
load_model_config,
save_model_config,
save_prediction,
TARGET_COL_V4,
)


EXAMPLE_PREDS_COL = "example_preds"
TARGET_COL = "target"
ERA_COL = "era"
# params we'll use to train all of our models.
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
@@ -21,26 +31,28 @@
# a value of 1 means no downsampling
# a value of 10 means use every 10th row
downsample_cross_val = 20
downsample_full_train = 1
downsample_full_train = 2
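The downsampling knobs above amount to stride-based row selection, per the comment "a value of 10 means use every 10th row". A toy illustration with hypothetical numbers:

```python
# Toy illustration (hypothetical numbers): every n-th row of a split is kept,
# so n = 1 keeps everything and n = 20 keeps 5% of the rows.
rows = list(range(10))
downsample = 2
print(rows[::downsample])  # -> [0, 2, 4, 6, 8]
```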

# if model_selection_loop=True get OOS performance for training_data
# and use that to select best model
# if model_selection_loop=False, just predict on tournament data using existing models and model config
model_selection_loop = True
model_selection_loop = False
model_config_name = "advanced_example_model"

napi = NumerAPI()

current_round = napi.get_current_round()

Path("./v4").mkdir(parents=False, exist_ok=True)
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/features.json")


print("Entering model selection loop. This may take awhile.")
if model_selection_loop:
model_config = {}
print('downloading training_data')
napi.download_dataset("numerai_training_data.parquet")

print("reading training data from local file")
training_data = pd.read_parquet('numerai_training_data.parquet')
training_data = pd.read_parquet('v4/train.parquet')

# keep track of some prediction columns
ensemble_cols = set()
@@ -50,7 +62,7 @@
possible_targets = [c for c in training_data.columns if c.startswith("target_")]
# randomly pick a handful of targets
# this can be vastly improved
targets = ["target", "target_nomi_60", "target_jerome_20"]
targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]

# all the possible features to train on
feature_cols = [c for c in training_data if c.startswith("feature_")]
@@ -72,7 +84,7 @@
# getting the per era correlation of each feature vs the primary target across the training split
print("getting feature correlations over time and identifying riskiest features")
all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
lambda d: d[feature_cols].corrwith(d[TARGET_COL_V4]))
# find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
# there are probably more clever ways to do this
riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)
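Based on the comment above (compare each feature's correlation with the target in the first vs the second half of the training eras), here is a plausible sketch of what `get_biggest_change_features` does; it is an assumption, not necessarily the repo's exact implementation:

```python
# Plausible sketch (assumption) of get_biggest_change_features: rank features by
# how much their mean per-era correlation with the target shifts between the
# first and second half of the eras, and return the n biggest movers.
def biggest_change_features_sketch(corrs, n):
    # corrs: pandas DataFrame indexed by era, one column per feature,
    # values are that feature's per-era correlation with the target
    half = len(corrs) // 2
    h1, h2 = corrs.iloc[:half].mean(), corrs.iloc[half:].mean()
    return (h2 - h1).abs().sort_values(ascending=False).head(n).index.tolist()
```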
@@ -136,7 +148,7 @@
# use example_col preds_model_target as an estimates since no example preds provided for training
# fast_mode=True so that we skip some of the stats that are slower to calculate
training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
fast_mode=True)
fast_mode=True, target_col=TARGET_COL_V4)
print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())

# pick the model that has the highest correlation sharpe
@@ -148,7 +160,7 @@
# getting the per era correlation of each feature vs the target across all of training data
print("getting feature correlations with target and identifying riskiest features")
all_feature_corrs = training_data.groupby(ERA_COL).apply(
lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
lambda d: d[feature_cols].corrwith(d[TARGET_COL_V4]))
# find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

@@ -183,35 +195,28 @@

""" Things that we always do even if we've already trained """
gc.collect()
print("downloading tournament_data")
napi.download_dataset("numerai_tournament_data.parquet", f"numerai_tournament_data_{current_round}.parquet")
print("downloading validation_data")
napi.download_dataset("numerai_validation_data.parquet")
print("downloading example_predictions")
napi.download_dataset('example_predictions.parquet', f'example_predictions_{current_round}.parquet')
print("downloading example_validation_predictions")
napi.download_dataset('example_validation_predictions.parquet')

print("reading tournament_data")
tournament_data = pd.read_parquet(f'numerai_tournament_data_{current_round}.parquet')
live_data = pd.read_parquet('v4/live.parquet')
print("reading validation_data")
validation_data = pd.read_parquet('numerai_validation_data.parquet')
validation_data = pd.read_parquet('v4/validation.parquet')
print("reading example_predictions")
example_preds = pd.read_parquet(f'example_predictions_{current_round}.parquet')
example_preds = pd.read_parquet('v4/live_example_preds.parquet')
print("reading example_validaton_predictions")
validation_example_preds = pd.read_parquet('example_validation_predictions.parquet')
validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')
# set the example predictions
validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]

# check for nans and fill nans
print("checking for nans in the tournament data")
if tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum().sum():
cols_w_nan = tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum()
total_rows = tournament_data[tournament_data["data_type"] == "live"]
if live_data.loc[:, feature_cols].isna().sum().sum():
cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
total_rows = len(live_data)
print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
print(f"out of {total_rows} total rows")
print(f"filling nans with 0.5")
tournament_data.loc[:, feature_cols] = tournament_data.loc[:, feature_cols].fillna(0.5)
live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)

else:
print("No nans in the features this week!")

@@ -231,7 +236,7 @@
print(f"New features are available! Might want to retrain model {model_name}.")
print(f"predicting tournament and validation for {model_name}")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
tournament_data.loc[:, f"preds_{model_name}"] = model.predict(tournament_data.loc[:, model_expected_features])
live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])

# do different neutralizations
# neutralize our predictions to the riskiest features only
@@ -242,7 +247,7 @@
proportion=1.0,
normalize=True,
era_col=ERA_COL)[f"preds_{model_name}"]
tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=tournament_data,
live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=live_data,
columns=[f"preds_{model_name}"],
neutralizers=riskiest_features,
proportion=1.0,
@@ -255,36 +260,37 @@

# rank per era for each prediction column so that we can combine safely
validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
tournament_data[list(pred_cols)] = tournament_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
# make ensembles for val and tournament
print('creating ensembles for tournament and validation')
validation_data["ensemble_neutral_riskiest_50"] = sum(
[validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
pct=True)
tournament_data["ensemble_neutral_riskiest_50"] = sum(
[tournament_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
live_data["ensemble_neutral_riskiest_50"] = sum(
[live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
pct=True)
ensemble_cols.add("ensemble_neutral_riskiest_50")

validation_data["ensemble_not_neutral"] = sum(
[validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
tournament_data["ensemble_not_neutral"] = sum(
[tournament_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
live_data["ensemble_not_neutral"] = sum(
[live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
ensemble_cols.add("ensemble_not_neutral")

validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
tournament_data["ensemble_all"] = sum([tournament_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)

ensemble_cols.add("ensemble_all")

gc.collect()
print("getting final validation stats")
# get our final validation stats for our chosen model
validation_stats = validation_metrics(validation_data, [best_pred_col], example_col=EXAMPLE_PREDS_COL,
fast_mode=False)
validation_stats = validation_metrics(validation_data, list(pred_cols)+list(ensemble_cols), example_col=EXAMPLE_PREDS_COL,
fast_mode=False, target_col=TARGET_COL_V4)
print(validation_stats.to_markdown())

# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
tournament_data["prediction"] = tournament_data[best_pred_col].rank(pct=True)
live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
save_prediction(tournament_data["prediction"], f"tournament_predictions_{current_round}")
save_prediction(live_data["prediction"], f"live_data_{current_round}")