-
Notifications
You must be signed in to change notification settings - Fork 126
APP-10775: viam training-script test-local #5524
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 1 commit
Commits
Show all changes
18 commits
Select commit
Hold shift + click to select a range
fdff64b
first pass
vpandiarajan20 fb04bd4
security and linting
vpandiarajan20 214911d
descrip text
vpandiarajan20 9c22fec
Merge branch 'viamrobotics:main' into APP-10775
vpandiarajan20 c76a430
dataset paths
vpandiarajan20 a8f8f8d
CLI help text
vpandiarajan20 3f4da57
lint
vpandiarajan20 e28a0d1
clean up
vpandiarajan20 753882e
Merge branch 'main' into APP-10775
vpandiarajan20 390ce70
Merge branch 'main' into APP-10775
vpandiarajan20 704f696
warning message
vpandiarajan20 1fad4ba
warning message 2
vpandiarajan20 001c886
change help text
vpandiarajan20 994b80e
remove validation
vpandiarajan20 c9d36d7
lint
vpandiarajan20 b6efdeb
to make windows work
vpandiarajan20 916fa04
Merge branch 'viamrobotics:main' into APP-10775
vpandiarajan20 4bc5312
help text
vpandiarajan20 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
dataset paths
- Loading branch information
commit c76a430d2a3d7a0ee436f19bcd599bdbc5e6cd35
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,6 +38,7 @@ const ( | |
| // Flags for test-local command | ||
| trainFlagContainerVersion = "container-version" | ||
| trainFlagDatasetFile = "dataset-file" | ||
| trainFlagDatasetRoot = "dataset-root" | ||
| trainFlagModelOutputDirectory = "model-output-directory" | ||
| trainFlagCustomArgs = "custom-args" | ||
| trainFlagTrainingScriptDirectory = "training-script-directory" | ||
|
|
@@ -617,6 +618,7 @@ func convertVisibilityToProto(visibility string) (*v1.Visibility, error) { | |
| type mlTrainingScriptTestLocalArgs struct { | ||
| ContainerVersion string | ||
| DatasetFile string | ||
| DatasetRoot string | ||
| ModelOutputDirectory string | ||
| TrainingScriptDirectory string | ||
| CustomArgs []string | ||
|
|
@@ -634,21 +636,37 @@ func MLTrainingScriptTestLocalAction(c *cli.Context, args mlTrainingScriptTestLo | |
| return err | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For example, just adding a "Docker is available. Using image BLAH" |
||
|
|
||
| // Get absolute path for dataset root directory | ||
| datasetRootAbs, err := filepath.Abs(args.DatasetRoot) | ||
| if err != nil { | ||
| return errors.Wrapf(err, "failed to get absolute path for dataset root directory") | ||
| } | ||
|
|
||
| // Dataset file is relative to dataset root (defaults to "dataset.jsonl") | ||
| datasetFileRelative := args.DatasetFile | ||
| if datasetFileRelative == "" { | ||
| datasetFileRelative = "dataset.jsonl" | ||
| } | ||
|
|
||
| // Clean the path to normalize it (remove .., ., extra slashes, etc.) | ||
| datasetFileRelative = filepath.Clean(datasetFileRelative) | ||
|
|
||
| // Ensure the path doesn't try to escape the dataset root | ||
| if strings.HasPrefix(datasetFileRelative, "..") || filepath.IsAbs(datasetFileRelative) { | ||
| return errors.Errorf("dataset file path must be relative to dataset root and cannot escape it: %s", datasetFileRelative) | ||
| } | ||
|
|
||
| // Validate required paths exist | ||
| if err := validateLocalTrainingPaths(args.TrainingScriptDirectory, args.DatasetFile); err != nil { | ||
| if err := validateLocalTrainingPaths(args.TrainingScriptDirectory, datasetRootAbs, filepath.Join(datasetRootAbs, datasetFileRelative)); err != nil { | ||
| return err | ||
| } | ||
| // Get absolute paths for volume mounting | ||
|
|
||
| // Get absolute path for training script directory | ||
| scriptDirAbs, err := filepath.Abs(args.TrainingScriptDirectory) | ||
| if err != nil { | ||
| return errors.Wrapf(err, "failed to get absolute path for training script directory") | ||
| } | ||
|
|
||
| datasetFileAbs, err := filepath.Abs(args.DatasetFile) | ||
| if err != nil { | ||
| return errors.Wrapf(err, "failed to get absolute path for dataset file") | ||
| } | ||
|
|
||
| // Determine output directory (default to current directory if not specified) | ||
| outputDir := args.ModelOutputDirectory | ||
| if outputDir == "" { | ||
|
|
@@ -665,7 +683,7 @@ func MLTrainingScriptTestLocalAction(c *cli.Context, args mlTrainingScriptTestLo | |
| } | ||
|
|
||
| // Create temporary script to run inside container | ||
| scriptContent, err := createTrainingScript(args.CustomArgs) | ||
| scriptContent, err := createTrainingScript(args.CustomArgs, datasetFileRelative) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
@@ -706,10 +724,10 @@ func MLTrainingScriptTestLocalAction(c *cli.Context, args mlTrainingScriptTestLo | |
| "--platform", "linux/x86_64", | ||
| "--rm", | ||
| "-v", fmt.Sprintf("%s:/training_script", scriptDirAbs), | ||
| "-v", fmt.Sprintf("%s:/dataset.jsonl", datasetFileAbs), | ||
| "-v", fmt.Sprintf("%s:/dataset_root", datasetRootAbs), | ||
| "-v", fmt.Sprintf("%s:/model_output", outputDirAbs), | ||
| "-v", fmt.Sprintf("%s:/run_training.sh", tmpScript.Name()), | ||
| "-w", "/training_script", | ||
| "-w", "/dataset_root", // Set working directory to dataset root so relative paths resolve correctly | ||
| containerImageURI, | ||
| "/run_training.sh", | ||
| } | ||
|
|
@@ -775,7 +793,7 @@ func checkDockerAvailable() error { | |
| } | ||
|
|
||
| // validateLocalTrainingPaths validates that required paths exist. | ||
| func validateLocalTrainingPaths(scriptDir, datasetFile string) error { | ||
| func validateLocalTrainingPaths(scriptDir, datasetRoot, datasetFile string) error { | ||
| // Check training script directory exists | ||
| if _, err := os.Stat(scriptDir); os.IsNotExist(err) { | ||
| return errors.Errorf("training script directory does not exist: %s", scriptDir) | ||
|
|
@@ -792,6 +810,17 @@ func validateLocalTrainingPaths(scriptDir, datasetFile string) error { | |
| return errors.Errorf("model/training.py not found in training script directory: %s", scriptDir) | ||
| } | ||
|
|
||
| // Check dataset root directory exists | ||
| if _, err := os.Stat(datasetRoot); os.IsNotExist(err) { | ||
| return errors.Errorf("dataset root directory does not exist: %s", datasetRoot) | ||
| } | ||
|
|
||
| // Check that the data folder exists | ||
| dataFolderPath := filepath.Join(datasetRoot, "data") | ||
| if _, err := os.Stat(dataFolderPath); os.IsNotExist(err) { | ||
| return errors.Errorf("data folder does not exist in dataset root directory: %s", datasetRoot) | ||
| } | ||
|
|
||
| // Check dataset file exists | ||
| if _, err := os.Stat(datasetFile); os.IsNotExist(err) { | ||
| return errors.Errorf("dataset file does not exist: %s", datasetFile) | ||
|
|
@@ -801,7 +830,8 @@ func validateLocalTrainingPaths(scriptDir, datasetFile string) error { | |
| } | ||
|
|
||
| // createTrainingScript creates the shell script content to run inside the container. | ||
| func createTrainingScript(customArgs []string) (string, error) { | ||
| // datasetFileRelative is the path to the dataset file relative to the dataset root (which is the CWD). | ||
| func createTrainingScript(customArgs []string, datasetFileRelative string) (string, error) { | ||
| // Validate custom arguments format (key=value) before building script | ||
| for _, arg := range customArgs { | ||
| if !strings.Contains(arg, "=") { | ||
|
|
@@ -822,12 +852,13 @@ func createTrainingScript(customArgs []string) (string, error) { | |
| script.WriteString("#!/bin/bash\n") | ||
| script.WriteString("set -e\n\n") | ||
| script.WriteString("echo \"Installing training script package...\"\n") | ||
etai-shuchatowitz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| script.WriteString("pip3 install --no-cache-dir .\n\n") | ||
| script.WriteString("pip3 install --no-cache-dir /training_script\n\n") | ||
| script.WriteString("echo \"Running training...\"\n") | ||
|
|
||
| // Build Python training command | ||
| script.WriteString("python3 -m model.training") | ||
| script.WriteString(" --dataset_file=/dataset.jsonl") | ||
| script.WriteString(" --dataset_file=") | ||
| script.WriteString(datasetFileRelative) | ||
| script.WriteString(" --model_output_directory=/model_output") | ||
|
|
||
| // Add custom arguments | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe this goes in documentation? I also want to add that if the containers really slow, it could be because the dataset root or training script directory has a bunch of extra files. Apparently mounting volumes can be expensive.