Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions go/cmd/dolt/commands/tblcmds/import.go
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,56 @@ func validateImportArgs(apr *argparser.ArgParseResults) errhand.VerboseError {
return nil
}

// validatePrimaryKeysAgainstSchema checks that every requested primary key
// exists in the reader's schema, either as a column's original name or as
// the name that column is rewritten to by nameMapper.
//
// It returns nil when primaryKeys is empty or every key resolves; otherwise
// it returns an error naming the missing key(s) and listing the columns that
// are actually available, in schema declaration order.
func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Schema, nameMapper rowconv.NameMapper) error {
	if len(primaryKeys) == 0 {
		return nil
	}

	cols := rdSchema.GetAllCols()

	// Single pass over the schema: collect every name a primary key may
	// legally match (original names plus their mapped forms) and remember
	// the declaration order for the error message. This replaces a nested
	// per-key iteration (O(keys × columns)) with O(keys + columns).
	valid := make(map[string]struct{})
	var availableCols []string
	cols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) {
		availableCols = append(availableCols, col.Name)
		valid[col.Name] = struct{}{}
		valid[nameMapper.Map(col.Name)] = struct{}{}
		return false, nil
	})

	var missingKeys []string
	for _, pk := range primaryKeys {
		if _, ok := valid[pk]; !ok {
			missingKeys = append(missingKeys, pk)
		}
	}

	if len(missingKeys) == 0 {
		return nil
	}

	// Error text is matched by integration tests; keep both forms stable.
	if len(missingKeys) == 1 {
		return fmt.Errorf("primary key '%s' not found in import file. Available columns: %s",
			missingKeys[0], strings.Join(availableCols, ", "))
	}
	return fmt.Errorf("primary keys %v not found in import file. Available columns: %s",
		missingKeys, strings.Join(availableCols, ", "))
}

type ImportCmd struct{}

// Name is returns the name of the Dolt cli command. This is what is used on the command line to invoke the command
Expand Down Expand Up @@ -529,6 +579,15 @@ func newImportDataReader(ctx context.Context, root doltdb.RootValue, dEnv *env.D
return nil, &mvdata.DataMoverCreationError{ErrType: mvdata.CreateReaderErr, Cause: err}
}

// Validate primary keys early for create operations, so that we return validation errors early
if impOpts.operation == mvdata.CreateOp && len(impOpts.primaryKeys) > 0 && impOpts.schFile == "" {
rdSchema := rd.GetSchema()
if err := validatePrimaryKeysAgainstSchema(impOpts.primaryKeys, rdSchema, impOpts.nameMapper); err != nil {
rd.Close(ctx)
return nil, &mvdata.DataMoverCreationError{ErrType: mvdata.SchemaErr, Cause: err}
}
}

return rd, nil
}

Expand Down
191 changes: 191 additions & 0 deletions go/cmd/dolt/commands/tblcmds/import_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
// Copyright 2025 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tblcmds

import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/dolthub/dolt/go/libraries/doltcore/rowconv"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
)

// TestValidatePrimaryKeysAgainstSchema exercises validatePrimaryKeysAgainstSchema
// with an identity name mapper across valid, invalid, and empty key sets, and
// pins the exact error text produced for missing keys.
func TestValidatePrimaryKeysAgainstSchema(t *testing.T) {
	// Schema under test has columns: id (pk), name, email, age.
	columns := []schema.Column{
		{Name: "id", Tag: 0, IsPartOfPK: true},
		{Name: "name", Tag: 1, IsPartOfPK: false},
		{Name: "email", Tag: 2, IsPartOfPK: false},
		{Name: "age", Tag: 3, IsPartOfPK: false},
	}
	sch, err := schema.NewSchema(
		schema.NewColCollection(columns...),
		nil, // pkOrdinals - nil means use default ordinals based on IsPartOfPK
		schema.Collation_Default,
		nil, // indexes
		nil, // checks
	)
	require.NoError(t, err)

	cases := []struct {
		name          string
		primaryKeys   []string
		schema        schema.Schema
		expectError   bool
		errorContains string
	}{
		{
			name:        "valid single primary key",
			primaryKeys: []string{"id"},
			schema:      sch,
			expectError: false,
		},
		{
			name:        "valid multiple primary keys",
			primaryKeys: []string{"id", "name"},
			schema:      sch,
			expectError: false,
		},
		{
			name:        "empty primary keys",
			primaryKeys: []string{},
			schema:      sch,
			expectError: false,
		},
		{
			name:          "invalid single primary key",
			primaryKeys:   []string{"invalid_col"},
			schema:        sch,
			expectError:   true,
			errorContains: "primary key 'invalid_col' not found in import file. Available columns: id, name, email, age",
		},
		{
			name:          "mix of valid and invalid primary keys",
			primaryKeys:   []string{"id", "invalid_col1", "name", "invalid_col2"},
			schema:        sch,
			expectError:   true,
			errorContains: "primary keys [invalid_col1 invalid_col2] not found in import file. Available columns: id, name, email, age",
		},
		{
			name:          "all invalid primary keys",
			primaryKeys:   []string{"col1", "col2", "col3"},
			schema:        sch,
			expectError:   true,
			errorContains: "primary keys [col1 col2 col3] not found in import file. Available columns: id, name, email, age",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			gotErr := validatePrimaryKeysAgainstSchema(tc.primaryKeys, tc.schema, rowconv.NameMapper{})
			if !tc.expectError {
				assert.NoError(t, gotErr)
				return
			}
			assert.Error(t, gotErr)
			if tc.errorContains != "" {
				assert.Contains(t, gotErr.Error(), tc.errorContains)
			}
		})
	}
}

// TestValidatePrimaryKeysAgainstSchemaColumnOrder verifies that the error
// message lists available columns in schema declaration order rather than
// alphabetical or tag order.
func TestValidatePrimaryKeysAgainstSchemaColumnOrder(t *testing.T) {
	// Declaration order (zebra first) deliberately disagrees with both the
	// alphabetical ordering and the tag ordering of the columns.
	sch, err := schema.NewSchema(
		schema.NewColCollection(
			schema.Column{Name: "zebra", Tag: 3, IsPartOfPK: false},
			schema.Column{Name: "alpha", Tag: 0, IsPartOfPK: true},
			schema.Column{Name: "beta", Tag: 1, IsPartOfPK: false},
			schema.Column{Name: "gamma", Tag: 2, IsPartOfPK: false},
		),
		nil, // pkOrdinals - nil means use default ordinals based on IsPartOfPK
		schema.Collation_Default,
		nil, // indexes
		nil, // checks
	)
	require.NoError(t, err)

	err = validatePrimaryKeysAgainstSchema([]string{"invalid"}, sch, rowconv.NameMapper{})
	assert.Error(t, err)
	// The detailed error must echo the columns in declaration order.
	assert.Contains(t, err.Error(), "primary key 'invalid' not found in import file. Available columns: zebra, alpha, beta, gamma")
}

// TestValidatePrimaryKeysWithNameMapping checks that a primary key may be
// given either as the file's original column name or as the name that column
// is mapped to, and that unmapped unknown names are still rejected.
func TestValidatePrimaryKeysWithNameMapping(t *testing.T) {
	// File columns: user_id (pk), user_name, user_email.
	sch, err := schema.NewSchema(
		schema.NewColCollection(
			schema.Column{Name: "user_id", Tag: 0, IsPartOfPK: true},
			schema.Column{Name: "user_name", Tag: 1, IsPartOfPK: false},
			schema.Column{Name: "user_email", Tag: 2, IsPartOfPK: false},
		),
		nil,
		schema.Collation_Default,
		nil,
		nil,
	)
	require.NoError(t, err)

	// Mapper renames user_id -> id, user_name -> name, user_email -> email.
	mapper := rowconv.NameMapper{
		"user_id":    "id",
		"user_name":  "name",
		"user_email": "email",
	}

	cases := []struct {
		name        string
		primaryKeys []string
		wantErr     bool
	}{
		{
			name:        "primary key using mapped name",
			primaryKeys: []string{"id"}, // mapped from user_id
			wantErr:     false,
		},
		{
			name:        "primary key using original name",
			primaryKeys: []string{"user_id"}, // original column name
			wantErr:     false,
		},
		{
			name:        "multiple primary keys with mixed names",
			primaryKeys: []string{"id", "name"}, // mapped names
			wantErr:     false,
		},
		{
			name:        "invalid primary key not in mapping",
			primaryKeys: []string{"invalid_col"},
			wantErr:     true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			gotErr := validatePrimaryKeysAgainstSchema(tc.primaryKeys, sch, mapper)
			if tc.wantErr {
				assert.Error(t, gotErr)
			} else {
				assert.NoError(t, gotErr)
			}
		})
	}
}
108 changes: 106 additions & 2 deletions integration-tests/bats/import-create-tables.bats
Original file line number Diff line number Diff line change
Expand Up @@ -254,14 +254,16 @@ CSV
run dolt table import -c -pk="batmansparents" test 1pk5col-ints.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "Error determining the output schema." ]] || false
[[ "$output" =~ "column 'batmansparents' not found" ]] || false
[[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false
[[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false
}

# A single bad key mixed in with valid ones must still be rejected, and the
# error must name the bad key and list the columns present in the file.
@test "import-create-tables: try to table import with one valid and one nonexistent --pk arg" {
run dolt table import -c -pk="pk,batmansparents" test 1pk5col-ints.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "Error determining the output schema." ]] || false
[[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false
[[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false
}

@test "import-create-tables: create a table with two primary keys from csv import" {
Expand Down Expand Up @@ -947,4 +949,106 @@ DELIM
[[ "$output" =~ '5,contains null,"[4,null]"' ]] || false
[[ "$output" =~ '6,empty,[]' ]] || false

}

# See: https://github.com/dolthub/dolt/issues/1083
# See: https://github.com/dolthub/dolt/issues/1083
# Covers early validation of --pk names against the import file's header:
# bad keys must fail before any rows are imported, for both CSV and PSV.
@test "import-create-tables: validate primary keys exist in CSV file" {
# Create a test CSV file
cat <<DELIM > test_pk_validation.csv
id,name,email,age
1,Alice,[email protected],30
2,Bob,[email protected],25
3,Charlie,[email protected],35
DELIM

# Test 1: Invalid single primary key should fail immediately
run dolt table import -c --pk "invalid_column" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false

# Test 2: Multiple invalid primary keys should fail immediately
run dolt table import -c --pk "invalid1,invalid2" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary keys" ]] || false
[[ "$output" =~ "invalid1 invalid2" ]] || false
[[ "$output" =~ "not found in import file" ]] || false

# Test 3: Mix of valid and invalid primary keys should fail
run dolt table import -c --pk "id,invalid_col,name" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_col' not found in import file" ]] || false

# Test 4: Valid primary key should succeed
run dolt table import -c --pk "id" test_table test_pk_validation.csv
[ "$status" -eq 0 ]

# Verify table was created correctly
run dolt sql -q "DESCRIBE test_table;"
[ "$status" -eq 0 ]
[[ "$output" =~ "id" ]] || false
[[ "$output" =~ "PRI" ]] || false

# Test 5: Valid multiple primary keys should succeed
run dolt table import -c --pk "id,name" test_table2 test_pk_validation.csv
[ "$status" -eq 0 ]

# Test 6: PSV file with invalid primary key should also fail immediately
cat <<DELIM > test_pk_validation.psv
id|name|email|age
1|Alice|[email protected]|30
2|Bob|[email protected]|25
3|Charlie|[email protected]|35
DELIM

run dolt table import -c --pk "nonexistent" test_table3 test_pk_validation.psv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'nonexistent' not found in import file" ]] || false

# Test 7: Large CSV should fail quickly (not after reading entire file)
# Create a larger CSV to simulate the original issue
{
echo "year,state_fips,county_fips,precinct,candidate,votes"
for i in {1..1000}; do
echo "2020,$i,$i,precinct$i,candidate$i,$i"
done
} > large_test.csv

# Time the command - it should fail immediately, not after processing all rows
start_time=$(date +%s)
run dolt table import -c --pk "year,state_fips,county_fips,precinct,invalid_column" precinct_results large_test.csv
end_time=$(date +%s)
duration=$((end_time - start_time))

[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false
# Should fail in less than 2 seconds (immediate validation)
# NOTE(review): second-granularity wall-clock check may be flaky on loaded CI
# runners — confirm the 2s budget is safe, or drop the timing assertion.
[ "$duration" -lt 2 ] || false
}

# When a --schema file is supplied, the primary keys come from the schema
# file, not --pk, so the early --pk-vs-header validation must not run.
@test "import-create-tables: primary key validation with schema file should skip validation" {
# Create a test CSV file
cat <<DELIM > test_data.csv
id,name,email
1,Alice,[email protected]
2,Bob,[email protected]
DELIM

# Create a schema file with different column as primary key
cat <<SQL > test_schema.sql
CREATE TABLE test_with_schema (
id INT,
name VARCHAR(100),
email VARCHAR(100),
PRIMARY KEY (name)
);
SQL

# When schema file is provided, it should work without primary key validation
run dolt table import -c --schema test_schema.sql test_with_schema test_data.csv
[ "$status" -eq 0 ]

# Verify that 'name' is the primary key (from schema file)
run dolt sql -q "SHOW CREATE TABLE test_with_schema;"
[ "$status" -eq 0 ]
[[ "$output" =~ "PRIMARY KEY (\`name\`)" ]] || false
}
1 change: 1 addition & 0 deletions integration-tests/bats/import-tables.bats
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ teardown() {
dolt table import -r -pk '`Key4' t `batshelper escaped-characters.csv`
dolt table import -r -pk "/Key5" t `batshelper escaped-characters.csv`
}

Loading