diff --git a/example_model.py b/example_model.py
index cbbf663..0d38868 100644
--- a/example_model.py
+++ b/example_model.py
@@ -1,13 +1,10 @@
-import time
-start = time.time()
-
 import pandas as pd
 from lightgbm import LGBMRegressor
 import gc
 import json
+from pathlib import Path

 from numerapi import NumerAPI
-from halo import Halo
 from utils import (
     save_model,
     load_model,
@@ -21,31 +18,42 @@
 )


+# download all the things
+
 napi = NumerAPI()
-spinner = Halo(text='', spinner='dots')

 current_round = napi.get_current_round()

 # Tournament data changes every week so we specify the round in their name. Training
 # and validation data only change periodically, so no need to download them every time.
 print('Downloading dataset files...')
-napi.download_dataset("numerai_training_data.parquet", "training_data.parquet")
-napi.download_dataset("numerai_tournament_data.parquet", f"tournament_data_{current_round}.parquet")
-napi.download_dataset("numerai_validation_data.parquet", f"validation_data.parquet")
-napi.download_dataset("example_validation_predictions.parquet")
-napi.download_dataset("features.json")
+
+Path("./v4").mkdir(parents=False, exist_ok=True)
+napi.download_dataset("v4/train.parquet")
+napi.download_dataset("v4/validation.parquet")
+napi.download_dataset("v4/live.parquet")
+napi.download_dataset("v4/validation_example_preds.parquet")
+napi.download_dataset("v4/features.json")

 print('Reading minimal training data')
-# read the feature metadata and get the "small" feature set
-with open("features.json", "r") as f:
+# read the feature metadata and get a feature set (or all the features)
+with open("v4/features.json", "r") as f:
     feature_metadata = json.load(f)
-features = feature_metadata["feature_sets"]["small"]
+# features = list(feature_metadata["feature_stats"].keys()) # get all the features
+# features = feature_metadata["feature_sets"]["small"] # get the small feature set
+features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
 # read in just those features along with era and target columns
 read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

 # note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
 # if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
-training_data = pd.read_parquet('training_data.parquet', columns=read_columns)
+training_data = pd.read_parquet('v4/train.parquet',
+                                columns=read_columns)
+validation_data = pd.read_parquet('v4/validation.parquet',
+                                  columns=read_columns)
+live_data = pd.read_parquet(f'v4/live.parquet',
+                            columns=read_columns)
+

 # pare down the number of eras to every 4th era
 # every_4th_era = training_data[ERA_COL].unique()[::4]
@@ -77,34 +85,27 @@
     model = LGBMRegressor(**params)

     # train on all of train and save the model so we don't have to train next time
-    spinner.start('Training model')
     model.fit(training_data.filter(like='feature_', axis='columns'),
               training_data[TARGET_COL])
     print(f"saving new model: {model_name}")
     save_model(model, model_name)
-    spinner.succeed()
 gc.collect()

-print('Reading minimal features of validation and tournament data...')
-validation_data = pd.read_parquet('validation_data.parquet',
-                                  columns=read_columns)
-tournament_data = pd.read_parquet(f'tournament_data_{current_round}.parquet',
-                                  columns=read_columns)
-nans_per_col = tournament_data[tournament_data["data_type"] == "live"].isna().sum()
+nans_per_col = live_data[live_data["data_type"] == "live"][features].isna().sum()

 # check for nans and fill nans
 if nans_per_col.any():
-    total_rows = len(tournament_data[tournament_data["data_type"] == "live"])
+    total_rows = len(live_data[live_data["data_type"] == "live"])
     print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
     print(f"out of {total_rows} total rows")
     print(f"filling nans with 0.5")
-    tournament_data.loc[:, features] = tournament_data.loc[:, features].fillna(0.5)
+    live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)
+
 else:
     print("No nans in the features this week!")

-spinner.start('Predicting on validation and tournament data')
 # double check the feature that the model expects vs what is available to prevent our
 # pipeline from failing if Numerai adds more data and we don't have time to retrain!
 model_expected_features = model.booster_.feature_name()
@@ -112,14 +113,11 @@
     print(f"New features are available! Might want to retrain model {model_name}.")
 validation_data.loc[:, f"preds_{model_name}"] = model.predict(
     validation_data.loc[:, model_expected_features])
-tournament_data.loc[:, f"preds_{model_name}"] = model.predict(
-    tournament_data.loc[:, model_expected_features])
-spinner.succeed()
+live_data.loc[:, f"preds_{model_name}"] = model.predict(
+    live_data.loc[:, model_expected_features])

 gc.collect()

-spinner.start('Neutralizing to risky features')
-
 # neutralize our predictions to the riskiest features
 validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
     df=validation_data,
@@ -130,33 +128,29 @@
     era_col=ERA_COL
 )

-tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
-    df=tournament_data,
+live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+    df=live_data,
     columns=[f"preds_{model_name}"],
     neutralizers=riskiest_features,
     proportion=1.0,
     normalize=True,
     era_col=ERA_COL
 )
-spinner.succeed()
-
 model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

 # rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
 validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
-tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
+live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
 validation_data["prediction"].to_csv(f"validation_predictions_{current_round}.csv")
-tournament_data["prediction"].to_csv(f"tournament_predictions_{current_round}.csv")
+live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")

-spinner.start('Reading example validation predictions')
-validation_preds = pd.read_parquet('example_validation_predictions.parquet')
+validation_preds = pd.read_parquet('v4/validation_example_preds.parquet')
 validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
-spinner.succeed()

 # get some stats about each of our models to compare...
 # fast_mode=True so that we skip some of the stats that are slower to calculate
-validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
+validation_stats = validation_metrics(validation_data, [model_to_submit, f"preds_{model_name}"], example_col=EXAMPLE_PREDS_COL, fast_mode=True, target_col=TARGET_COL)
 print(validation_stats[["mean", "sharpe"]].to_markdown())

 print(f'''
@@ -165,5 +159,3 @@
 2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
 3. Submit tournament_predictions_{current_round}.csv to the "Upload Predictions" button
 ''')
-
-print(f'done in {(time.time() - start) / 60} mins')
diff --git a/example_model_advanced.py b/example_model_advanced.py
index e8fc9d8..428e5e6 100644
--- a/example_model_advanced.py
+++ b/example_model_advanced.py
@@ -2,12 +2,22 @@
 from lightgbm import LGBMRegressor
 import gc
 from numerapi import NumerAPI
-from utils import save_prediction, save_model, load_model, neutralize, get_biggest_change_features, validation_metrics, download_data, \
-    load_model_config, save_model_config, get_time_series_cross_val_splits
+from pathlib import Path
+from utils import (
+    save_model,
+    load_model,
+    neutralize,
+    get_biggest_change_features,
+    get_time_series_cross_val_splits,
+    validation_metrics,
+    load_model_config,
+    save_model_config,
+    save_prediction,
+    TARGET_COL,
+)

 EXAMPLE_PREDS_COL = "example_preds"
-TARGET_COL = "target"
 ERA_COL = "era"

 # params we'll use to train all of our models.
 # Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
@@ -21,7 +31,7 @@
 # a value of 1 means no downsampling
 # a value of 10 means use every 10th row
 downsample_cross_val = 20
-downsample_full_train = 1
+downsample_full_train = 2

 # if model_selection_loop=True get OOS performance for training_data
 # and use that to select best model
@@ -33,14 +43,16 @@
 current_round = napi.get_current_round()

+Path("./v4").mkdir(parents=False, exist_ok=True)
+napi.download_dataset("v4/train.parquet")
+napi.download_dataset("v4/features.json")
+
+
 print("Entering model selection loop. This may take awhile.")
 if model_selection_loop:
     model_config = {}
-    print('downloading training_data')
-    napi.download_dataset("numerai_training_data.parquet")
-
-    print("reading training data from local file")
-    training_data = pd.read_parquet('numerai_training_data.parquet')
+    print('reading training_data')
+    training_data = pd.read_parquet('v4/train.parquet')

     # keep track of some prediction columns
     ensemble_cols = set()
@@ -50,7 +62,7 @@
     possible_targets = [c for c in training_data.columns if c.startswith("target_")]
     # randomly pick a handful of targets
     # this can be vastly improved
-    targets = ["target", "target_nomi_60", "target_jerome_20"]
+    targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]

     # all the possible features to train on
     feature_cols = [c for c in training_data if c.startswith("feature_")]
@@ -136,7 +148,7 @@
     # use example_col preds_model_target as an estimates since no example preds provided for training
     # fast_mode=True so that we skip some of the stats that are slower to calculate
     training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
-                                        fast_mode=True)
+                                        fast_mode=True, target_col=TARGET_COL)
     print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())

     # pick the model that has the highest correlation sharpe
@@ -183,35 +195,28 @@
 """ Things that we always do even if we've already trained """
 gc.collect()

-print("downloading tournament_data")
-napi.download_dataset("numerai_tournament_data.parquet", f"numerai_tournament_data_{current_round}.parquet")
-print("downloading validation_data")
-napi.download_dataset("numerai_validation_data.parquet")
-print("downloading example_predictions")
-napi.download_dataset('example_predictions.parquet', f'example_predictions_{current_round}.parquet')
-print("downloading example_validation_predictions")
-napi.download_dataset('example_validation_predictions.parquet')
 print("reading tournament_data")
-tournament_data = pd.read_parquet(f'numerai_tournament_data_{current_round}.parquet')
+live_data = pd.read_parquet('v4/live.parquet')
 print("reading validation_data")
-validation_data = pd.read_parquet('numerai_validation_data.parquet')
+validation_data = pd.read_parquet('v4/validation.parquet')
 print("reading example_predictions")
-example_preds = pd.read_parquet(f'example_predictions_{current_round}.parquet')
+example_preds = pd.read_parquet('v4/live_example_preds.parquet')
 print("reading example_validaton_predictions")
-validation_example_preds = pd.read_parquet('example_validation_predictions.parquet')
+validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')

 # set the example predictions
 validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]

 # check for nans and fill nans
 print("checking for nans in the tournament data")
-if tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum().sum():
-    cols_w_nan = tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum()
-    total_rows = tournament_data[tournament_data["data_type"] == "live"]
+if live_data.loc[:, feature_cols].isna().sum().sum():
+    cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
+    total_rows = len(live_data)
     print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
     print(f"out of {total_rows} total rows")
     print(f"filling nans with 0.5")
-    tournament_data.loc[:, feature_cols] = tournament_data.loc[:, feature_cols].fillna(0.5)
+    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
+
 else:
     print("No nans in the features this week!")

@@ -231,7 +236,7 @@
         print(f"New features are available! Might want to retrain model {model_name}.")
     print(f"predicting tournament and validation for {model_name}")
     validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
-    tournament_data.loc[:, f"preds_{model_name}"] = model.predict(tournament_data.loc[:, model_expected_features])
+    live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])

     # do different neutralizations
     # neutralize our predictions to the riskiest features only
@@ -242,7 +247,7 @@
                                                                             proportion=1.0,
                                                                             normalize=True,
                                                                             era_col=ERA_COL)[f"preds_{model_name}"]
-    tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=tournament_data,
+    live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=live_data,
                                                                             columns=[f"preds_{model_name}"],
                                                                             neutralizers=riskiest_features,
                                                                             proportion=1.0,
@@ -255,36 +260,37 @@
 # rank per era for each prediction column so that we can combine safely
 validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
-tournament_data[list(pred_cols)] = tournament_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
+live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))

 # make ensembles for val and tournament
 print('creating ensembles for tournament and validation')
 validation_data["ensemble_neutral_riskiest_50"] = sum(
     [validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
     pct=True)
-tournament_data["ensemble_neutral_riskiest_50"] = sum(
-    [tournament_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
+live_data["ensemble_neutral_riskiest_50"] = sum(
+    [live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
     pct=True)
 ensemble_cols.add("ensemble_neutral_riskiest_50")

 validation_data["ensemble_not_neutral"] = sum(
     [validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
-tournament_data["ensemble_not_neutral"] = sum(
-    [tournament_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
+live_data["ensemble_not_neutral"] = sum(
+    [live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
 ensemble_cols.add("ensemble_not_neutral")

 validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
-tournament_data["ensemble_all"] = sum([tournament_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
+live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
+
 ensemble_cols.add("ensemble_all")

 gc.collect()
 print("getting final validation stats")
 # get our final validation stats for our chosen model
-validation_stats = validation_metrics(validation_data, [best_pred_col], example_col=EXAMPLE_PREDS_COL,
-                                      fast_mode=False)
+validation_stats = validation_metrics(validation_data, list(pred_cols)+list(ensemble_cols), example_col=EXAMPLE_PREDS_COL,
+                                      fast_mode=False, target_col=TARGET_COL)
 print(validation_stats.to_markdown())

 # rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
 validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
-tournament_data["prediction"] = tournament_data[best_pred_col].rank(pct=True)
+live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
 save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
-save_prediction(tournament_data["prediction"], f"tournament_predictions_{current_round}")
+save_prediction(live_data["prediction"], f"live_data_{current_round}")
diff --git a/utils.py b/utils.py
index 6133d73..ccea7e0 100644
--- a/utils.py
+++ b/utils.py
@@ -1,15 +1,13 @@
-import os
-import requests
 import numpy as np
 import pandas as pd
 import scipy
 from halo import Halo
 from pathlib import Path
 import json
-from scipy.stats import skew, kurtosis
+from scipy.stats import skew


 ERA_COL = "era"
-TARGET_COL = "target_nomi_20"
+TARGET_COL = "target_nomi_v4_20"
 DATA_TYPE_COL = "data_type"
 EXAMPLE_PREDS_COL = "example_preds"
@@ -19,6 +17,7 @@
 MODEL_CONFIGS_FOLDER = "model_configs"
 PREDICTION_FILES_FOLDER = "prediction_files"

+
 def save_prediction(df, name):
     try:
         Path(PREDICTION_FILES_FOLDER).mkdir(exist_ok=True, parents=True)
@@ -26,6 +25,7 @@ def save_prediction(df, name):
         pass
     df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True)

+
 def save_model(model, name):
     try:
         Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True)
@@ -76,7 +76,7 @@ def get_biggest_change_features(corrs, n):
     return worst_n


-def get_time_series_cross_val_splits(data, cv = 3, embargo = 12):
+def get_time_series_cross_val_splits(data, cv=3, embargo=12):
     all_train_eras = data[ERA_COL].unique()
     len_split = len(all_train_eras) // cv
     test_splits = [all_train_eras[i * len_split:(i + 1) * len_split] for i in range(cv)]
@@ -126,7 +126,7 @@ def neutralize(df,
             exposures = df_era[neutralizers].values

         scores -= proportion * exposures.dot(
-            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))
+            np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot(scores.astype(np.float32)))

         scores /= scores.std(ddof=0)

@@ -158,12 +158,12 @@ def unif(df):
     return pd.Series(x, index=df.index)


-def get_feature_neutral_mean(df, prediction_col):
+def get_feature_neutral_mean(df, prediction_col, target_col):
     feature_cols = [c for c in df.columns if c.startswith("feature")]
     df.loc[:, "neutral_sub"] = neutralize(df, [prediction_col],
                                           feature_cols)[prediction_col]
     scores = df.groupby("era").apply(
-        lambda x: (unif(x["neutral_sub"]).corr(x[TARGET_COL]))).mean()
+        lambda x: (unif(x["neutral_sub"]).corr(x[target_col]))).mean()
     return np.mean(scores)


@@ -188,13 +188,13 @@ def fast_score_by_date(df, columns, target, tb=None, era_col="era"):
     return pd.DataFrame(np.array(computed), columns=columns, index=df[era_col].unique())


-def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False):
+def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False, target_col=TARGET_COL):
     validation_stats = pd.DataFrame()
     feature_cols = [c for c in validation_data if c.startswith("feature_")]
     for pred_col in pred_cols:
         # Check the per-era correlations on the validation set (out of sample)
         validation_correlations = validation_data.groupby(ERA_COL).apply(
-            lambda d: unif(d[pred_col]).corr(d[TARGET_COL]))
+            lambda d: unif(d[pred_col]).corr(d[target_col]))

         mean = validation_correlations.mean()
         std = validation_correlations.std(ddof=0)
@@ -214,13 +214,13 @@ def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False)
         payout_daily_value = (payout_scores + 1).cumprod()

         apy = (
-            (
-                (payout_daily_value.dropna().iloc[-1])
-                ** (1 / len(payout_scores))
-            )
-            ** 49  # 52 weeks of compounding minus 3 for stake compounding lag
-            - 1
-        ) * 100
+                      (
+                              (payout_daily_value.dropna().iloc[-1])
+                              ** (1 / len(payout_scores))
+                      )
+                      ** 49  # 52 weeks of compounding minus 3 for stake compounding lag
+                      - 1
+              ) * 100

         validation_stats.loc["apy", pred_col] = apy

@@ -232,14 +232,14 @@ def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False)
         validation_stats.loc["max_feature_exposure", pred_col] = max_feature_exposure

         # Check feature neutral mean
-        feature_neutral_mean = get_feature_neutral_mean(validation_data, pred_col)
+        feature_neutral_mean = get_feature_neutral_mean(validation_data, pred_col, target_col)
         validation_stats.loc["feature_neutral_mean", pred_col] = feature_neutral_mean

         # Check top and bottom 200 metrics (TB200)
         tb200_validation_correlations = fast_score_by_date(
             validation_data,
             [pred_col],
-            TARGET_COL,
+            target_col,
             tb=200,
             era_col=ERA_COL
         )
@@ -257,8 +257,8 @@ def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False)
         corr_scores = []
         for _, x in validation_data.groupby(ERA_COL):
             series = neutralize_series(unif(x[pred_col]), (x[example_col]))
-            mmc_scores.append(np.cov(series, x[TARGET_COL])[0, 1] / (0.29 ** 2))
-            corr_scores.append(unif(x[pred_col]).corr(x[TARGET_COL]))
+            mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29 ** 2))
+            corr_scores.append(unif(x[pred_col]).corr(x[target_col]))

         val_mmc_mean = np.mean(mmc_scores)
         val_mmc_std = np.std(mmc_scores)