diff --git a/run_modal.py b/run_modal.py
index 4675c1cb..669d47f5 100644
--- a/run_modal.py
+++ b/run_modal.py
@@ -7,6 +7,7 @@
 '''
 
 import os
+import subprocess
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 import sys
 import modal
@@ -21,18 +22,26 @@
 # turn off diffusers telemetry until I can figure out how to make it opt-in
 os.environ['DISABLE_TELEMETRY'] = 'YES'
 
+# Load models from our volume, so we don't need to download them with huggingface-cli on every run
+model_volume = modal.Volume.from_name("models", create_if_missing=True)
+
 # define the volume for storing model outputs, using "creating volumes lazily": https://modal.com/docs/guide/volumes
 # you will find your model, samples and optimizer stored in: https://modal.com/storage/your-username/main/flux-lora-models
-model_volume = modal.Volume.from_name("flux-lora-models", create_if_missing=True)
+trainings_volume = modal.Volume.from_name("trainings", create_if_missing=True)
 
 # modal_output, due to "cannot mount volume on non-empty path" requirement
-MOUNT_DIR = "/root/ai-toolkit/modal_output" # modal_output, due to "cannot mount volume on non-empty path" requirement
+MODELS_MOUNT_DIR = "/root/ai-toolkit/vol/models" # under vol/, due to "cannot mount volume on non-empty path" requirement
+TRAIN_MOUNT_DIR = "/root/ai-toolkit/vol/trainings"
+
+# Get the current directory where this script is located (ai-toolkit directory)
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 # define modal app
 image = (
-    modal.Image.debian_slim(python_version="3.11")
+    modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.11")
     # install required system and pip packages, more about this modal approach: https://modal.com/docs/examples/dreambooth_app
     .apt_install("libgl1", "libglib2.0-0")
+    .pip_install("cupy-cuda12x")
     .pip_install(
         "python-dotenv",
         "torch",
@@ -69,14 +78,15 @@
         "huggingface_hub",
         "peft"
     )
+    # mount for the entire ai-toolkit directory
+    # dynamically use the current directory where this script is located
+    .add_local_dir(CURRENT_DIR, remote_path="/root/ai-toolkit")
 )
 
-# mount for the entire ai-toolkit directory
-# example: "/Users/username/ai-toolkit" is the local directory, "/root/ai-toolkit" is the remote directory
-code_mount = modal.Mount.from_local_dir("/Users/username/ai-toolkit", remote_path="/root/ai-toolkit")
+
 
 # create the Modal app with the necessary mounts and volumes
-app = modal.App(name="flux-lora-training", image=image, mounts=[code_mount], volumes={MOUNT_DIR: model_volume})
+app = modal.App(name="ostris-ai-toolkit", image=image, volumes={MODELS_MOUNT_DIR: model_volume, TRAIN_MOUNT_DIR: trainings_volume})
 
 # Check if we have DEBUG_TOOLKIT in env
 if os.environ.get("DEBUG_TOOLKIT", "0") == "1":
@@ -104,28 +114,36 @@ def print_end_message(jobs_completed, jobs_failed):
 
 @app.function(
     # request a GPU with at least 24GB VRAM
     # more about modal GPU's: https://modal.com/docs/guide/gpu
-    gpu="A100", # gpu="H100"
+    gpu="H100", # gpu="A100"
     # more about modal timeouts: https://modal.com/docs/guide/timeouts
-    timeout=7200 # 2 hours, increase or decrease if needed
+    timeout=7200, # 2 hours, increase or decrease if needed
 )
-def main(config_file_list_str: str, recover: bool = False, name: str = None):
+def main(config_id_list_str: str, recover: bool = False, name: str = None):
     # convert the config file list from a string to a list
-    config_file_list = config_file_list_str.split(",")
+    config_id_list = config_id_list_str.split(",")
 
     jobs_completed = 0
     jobs_failed = 0
 
-    print(f"Running {len(config_file_list)} job{'' if len(config_file_list) == 1 else 's'}")
+    print(f"Running {len(config_id_list)} job{'' if len(config_id_list) == 1 else 's'}")
 
-    for config_file in config_file_list:
+    for config_id in config_id_list:
         try:
+            config_folder = f"{TRAIN_MOUNT_DIR}/{config_id}"
+            config_file = f"{config_folder}/config.yaml"
             job = get_job(config_file, name)
-            job.config['process'][0]['training_folder'] = MOUNT_DIR
-            os.makedirs(MOUNT_DIR, exist_ok=True)
-            print(f"Training outputs will be saved to: {MOUNT_DIR}")
+            job.config['name'] = config_id
+            print(job.config['name'])
+            job.config['process'][0]['training_folder'] = f"{config_folder}/output"
+            print(job.config['process'][0]['training_folder'])
+            job.config['process'][0]['datasets'][0]['folder_path'] = f"{config_folder}/dataset"
+            print(job.config['process'][0]['datasets'][0]['folder_path'])
+            job.meta['name'] = config_id
+
+            os.makedirs(config_folder, exist_ok=True)
+            print(f"Training outputs will be saved to: {config_folder}")
 
-            # run the job
             job.run()
 
             # commit the volume after training
@@ -147,7 +165,7 @@ def main(config_file_list_str: str, recover: bool = False, name: str = None):
 
     # require at least one config file
    parser.add_argument(
-        'config_file_list',
+        'config_id_list_str',
         nargs='+',
         type=str,
         help='Name of config file (eg: person_v1 for config/person_v1.json/yaml), or full path if it is not in config folder, you can pass multiple config files and run them all sequentially'
@@ -170,6 +188,6 @@ def main(config_file_list_str: str, recover: bool = False, name: str = None):
     args = parser.parse_args()
 
     # convert list of config files to a comma-separated string for Modal compatibility
-    config_file_list_str = ",".join(args.config_file_list)
+    config_id_list_str = ",".join(args.config_id_list_str)
 
-    main.call(config_file_list_str=config_file_list_str, recover=args.recover, name=args.name)
+    main.call(config_id_list_str=config_id_list_str, recover=args.recover, name=args.name)
\ No newline at end of file
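
Usage sketch (not part of the patch; the training name "person_v1" below is a hypothetical example). With this change the positional argument is a training ID rather than a config file path: each ID is expected to name a folder in the "trainings" volume that already contains a config.yaml and a dataset/ directory, and outputs are written back to <id>/output inside that volume, while models are cached in the separate "models" volume. Assuming the Modal CLI is installed and authenticated, the files could be staged with `modal volume put` before launching:

    # stage the hypothetical training "person_v1" in the "trainings" volume
    modal volume put trainings ./person_v1/config.yaml /person_v1/config.yaml
    modal volume put trainings ./person_v1/dataset /person_v1/dataset

    # then run one or more trainings by ID (IDs are joined into a comma-separated string for main)
    python run_modal.py person_v1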