Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions go/cmd/dolt/commands/tblcmds/import.go
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,56 @@ func validateImportArgs(apr *argparser.ArgParseResults) errhand.VerboseError {
return nil
}

// validatePrimaryKeysAgainstSchema checks that every requested primary key
// exists in the reader's schema, either as a column's original name or as
// the name that column is rewritten to by nameMapper.
//
// It returns nil when primaryKeys is empty or every key resolves; otherwise
// it returns an error naming the missing key(s) and listing the columns that
// are actually available, in schema declaration order.
func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Schema, nameMapper rowconv.NameMapper) error {
	if len(primaryKeys) == 0 {
		return nil
	}

	cols := rdSchema.GetAllCols()

	// Single pass over the schema: collect every name a primary key may
	// legally match (original names plus their mapped forms) and remember
	// the declaration order for the error message. This replaces a nested
	// per-key iteration (O(keys × columns)) with O(keys + columns).
	valid := make(map[string]struct{})
	var availableCols []string
	cols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) {
		availableCols = append(availableCols, col.Name)
		valid[col.Name] = struct{}{}
		valid[nameMapper.Map(col.Name)] = struct{}{}
		return false, nil
	})

	var missingKeys []string
	for _, pk := range primaryKeys {
		if _, ok := valid[pk]; !ok {
			missingKeys = append(missingKeys, pk)
		}
	}

	if len(missingKeys) == 0 {
		return nil
	}

	// Error text is matched by integration tests; keep both forms stable.
	if len(missingKeys) == 1 {
		return fmt.Errorf("primary key '%s' not found in import file. Available columns: %s",
			missingKeys[0], strings.Join(availableCols, ", "))
	}
	return fmt.Errorf("primary keys %v not found in import file. Available columns: %s",
		missingKeys, strings.Join(availableCols, ", "))
}

type ImportCmd struct{}

// Name is returns the name of the Dolt cli command. This is what is used on the command line to invoke the command
Expand Down Expand Up @@ -529,6 +579,15 @@ func newImportDataReader(ctx context.Context, root doltdb.RootValue, dEnv *env.D
return nil, &mvdata.DataMoverCreationError{ErrType: mvdata.CreateReaderErr, Cause: err}
}

// Validate primary keys early for create operations, so that we return validation errors early
if impOpts.operation == mvdata.CreateOp && len(impOpts.primaryKeys) > 0 && impOpts.schFile == "" {
rdSchema := rd.GetSchema()
if err := validatePrimaryKeysAgainstSchema(impOpts.primaryKeys, rdSchema, impOpts.nameMapper); err != nil {
rd.Close(ctx)
return nil, &mvdata.DataMoverCreationError{ErrType: mvdata.SchemaErr, Cause: err}
}
}

return rd, nil
}

Expand Down
191 changes: 191 additions & 0 deletions go/cmd/dolt/commands/tblcmds/import_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
// Copyright 2025 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tblcmds

import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/dolthub/dolt/go/libraries/doltcore/rowconv"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
)

// TestValidatePrimaryKeysAgainstSchema exercises validatePrimaryKeysAgainstSchema
// with an identity name mapper across valid, invalid, and empty key sets, and
// pins the exact error text produced for missing keys.
func TestValidatePrimaryKeysAgainstSchema(t *testing.T) {
	// Schema under test has columns: id (pk), name, email, age.
	columns := []schema.Column{
		{Name: "id", Tag: 0, IsPartOfPK: true},
		{Name: "name", Tag: 1, IsPartOfPK: false},
		{Name: "email", Tag: 2, IsPartOfPK: false},
		{Name: "age", Tag: 3, IsPartOfPK: false},
	}
	sch, err := schema.NewSchema(
		schema.NewColCollection(columns...),
		nil, // pkOrdinals - nil means use default ordinals based on IsPartOfPK
		schema.Collation_Default,
		nil, // indexes
		nil, // checks
	)
	require.NoError(t, err)

	cases := []struct {
		name          string
		primaryKeys   []string
		schema        schema.Schema
		expectError   bool
		errorContains string
	}{
		{
			name:        "valid single primary key",
			primaryKeys: []string{"id"},
			schema:      sch,
			expectError: false,
		},
		{
			name:        "valid multiple primary keys",
			primaryKeys: []string{"id", "name"},
			schema:      sch,
			expectError: false,
		},
		{
			name:        "empty primary keys",
			primaryKeys: []string{},
			schema:      sch,
			expectError: false,
		},
		{
			name:          "invalid single primary key",
			primaryKeys:   []string{"invalid_col"},
			schema:        sch,
			expectError:   true,
			errorContains: "primary key 'invalid_col' not found in import file. Available columns: id, name, email, age",
		},
		{
			name:          "mix of valid and invalid primary keys",
			primaryKeys:   []string{"id", "invalid_col1", "name", "invalid_col2"},
			schema:        sch,
			expectError:   true,
			errorContains: "primary keys [invalid_col1 invalid_col2] not found in import file. Available columns: id, name, email, age",
		},
		{
			name:          "all invalid primary keys",
			primaryKeys:   []string{"col1", "col2", "col3"},
			schema:        sch,
			expectError:   true,
			errorContains: "primary keys [col1 col2 col3] not found in import file. Available columns: id, name, email, age",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			gotErr := validatePrimaryKeysAgainstSchema(tc.primaryKeys, tc.schema, rowconv.NameMapper{})
			if !tc.expectError {
				assert.NoError(t, gotErr)
				return
			}
			assert.Error(t, gotErr)
			if tc.errorContains != "" {
				assert.Contains(t, gotErr.Error(), tc.errorContains)
			}
		})
	}
}

// TestValidatePrimaryKeysAgainstSchemaColumnOrder verifies that the error
// message lists available columns in schema declaration order rather than
// alphabetical or tag order.
func TestValidatePrimaryKeysAgainstSchemaColumnOrder(t *testing.T) {
	// Declaration order (zebra first) deliberately disagrees with both the
	// alphabetical ordering and the tag ordering of the columns.
	sch, err := schema.NewSchema(
		schema.NewColCollection(
			schema.Column{Name: "zebra", Tag: 3, IsPartOfPK: false},
			schema.Column{Name: "alpha", Tag: 0, IsPartOfPK: true},
			schema.Column{Name: "beta", Tag: 1, IsPartOfPK: false},
			schema.Column{Name: "gamma", Tag: 2, IsPartOfPK: false},
		),
		nil, // pkOrdinals - nil means use default ordinals based on IsPartOfPK
		schema.Collation_Default,
		nil, // indexes
		nil, // checks
	)
	require.NoError(t, err)

	err = validatePrimaryKeysAgainstSchema([]string{"invalid"}, sch, rowconv.NameMapper{})
	assert.Error(t, err)
	// The detailed error must echo the columns in declaration order.
	assert.Contains(t, err.Error(), "primary key 'invalid' not found in import file. Available columns: zebra, alpha, beta, gamma")
}

// TestValidatePrimaryKeysWithNameMapping checks that a primary key may be
// given either as the file's original column name or as the name that column
// is mapped to, and that unmapped unknown names are still rejected.
func TestValidatePrimaryKeysWithNameMapping(t *testing.T) {
	// File columns: user_id (pk), user_name, user_email.
	sch, err := schema.NewSchema(
		schema.NewColCollection(
			schema.Column{Name: "user_id", Tag: 0, IsPartOfPK: true},
			schema.Column{Name: "user_name", Tag: 1, IsPartOfPK: false},
			schema.Column{Name: "user_email", Tag: 2, IsPartOfPK: false},
		),
		nil,
		schema.Collation_Default,
		nil,
		nil,
	)
	require.NoError(t, err)

	// Mapper renames user_id -> id, user_name -> name, user_email -> email.
	mapper := rowconv.NameMapper{
		"user_id":    "id",
		"user_name":  "name",
		"user_email": "email",
	}

	cases := []struct {
		name        string
		primaryKeys []string
		wantErr     bool
	}{
		{
			name:        "primary key using mapped name",
			primaryKeys: []string{"id"}, // mapped from user_id
			wantErr:     false,
		},
		{
			name:        "primary key using original name",
			primaryKeys: []string{"user_id"}, // original column name
			wantErr:     false,
		},
		{
			name:        "multiple primary keys with mixed names",
			primaryKeys: []string{"id", "name"}, // mapped names
			wantErr:     false,
		},
		{
			name:        "invalid primary key not in mapping",
			primaryKeys: []string{"invalid_col"},
			wantErr:     true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			gotErr := validatePrimaryKeysAgainstSchema(tc.primaryKeys, sch, mapper)
			if tc.wantErr {
				assert.Error(t, gotErr)
			} else {
				assert.NoError(t, gotErr)
			}
		})
	}
}
108 changes: 106 additions & 2 deletions integration-tests/bats/import-create-tables.bats
Original file line number Diff line number Diff line change
Expand Up @@ -254,14 +254,16 @@ CSV
run dolt table import -c -pk="batmansparents" test 1pk5col-ints.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "Error determining the output schema." ]] || false
[[ "$output" =~ "column 'batmansparents' not found" ]] || false
[[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false
[[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false
}

# A single bad key mixed in with valid ones must still be rejected, and the
# error must name the bad key and list the columns present in the file.
@test "import-create-tables: try to table import with one valid and one nonexistent --pk arg" {
run dolt table import -c -pk="pk,batmansparents" test 1pk5col-ints.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "Error determining the output schema." ]] || false
[[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false
[[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false
}

@test "import-create-tables: create a table with two primary keys from csv import" {
Expand Down Expand Up @@ -947,4 +949,106 @@ DELIM
[[ "$output" =~ '5,contains null,"[4,null]"' ]] || false
[[ "$output" =~ '6,empty,[]' ]] || false

}

# See: https://github.com/dolthub/dolt/issues/1083
# See: https://github.com/dolthub/dolt/issues/1083
# Covers early validation of --pk names against the import file's header:
# bad keys must fail before any rows are imported, for both CSV and PSV.
@test "import-create-tables: validate primary keys exist in CSV file" {
# Create a test CSV file
cat <<DELIM > test_pk_validation.csv
id,name,email,age
1,Alice,[email protected],30
2,Bob,[email protected],25
3,Charlie,[email protected],35
DELIM

# Test 1: Invalid single primary key should fail immediately
run dolt table import -c --pk "invalid_column" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false

# Test 2: Multiple invalid primary keys should fail immediately
run dolt table import -c --pk "invalid1,invalid2" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary keys" ]] || false
[[ "$output" =~ "invalid1 invalid2" ]] || false
[[ "$output" =~ "not found in import file" ]] || false

# Test 3: Mix of valid and invalid primary keys should fail
run dolt table import -c --pk "id,invalid_col,name" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_col' not found in import file" ]] || false

# Test 4: Valid primary key should succeed
run dolt table import -c --pk "id" test_table test_pk_validation.csv
[ "$status" -eq 0 ]

# Verify table was created correctly
run dolt sql -q "DESCRIBE test_table;"
[ "$status" -eq 0 ]
[[ "$output" =~ "id" ]] || false
[[ "$output" =~ "PRI" ]] || false

# Test 5: Valid multiple primary keys should succeed
run dolt table import -c --pk "id,name" test_table2 test_pk_validation.csv
[ "$status" -eq 0 ]

# Test 6: PSV file with invalid primary key should also fail immediately
cat <<DELIM > test_pk_validation.psv
id|name|email|age
1|Alice|[email protected]|30
2|Bob|[email protected]|25
3|Charlie|[email protected]|35
DELIM

run dolt table import -c --pk "nonexistent" test_table3 test_pk_validation.psv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'nonexistent' not found in import file" ]] || false

# Test 7: Large CSV should fail quickly (not after reading entire file)
# Create a larger CSV to simulate the original issue
{
echo "year,state_fips,county_fips,precinct,candidate,votes"
for i in {1..1000}; do
echo "2020,$i,$i,precinct$i,candidate$i,$i"
done
} > large_test.csv

# Time the command - it should fail immediately, not after processing all rows
start_time=$(date +%s)
run dolt table import -c --pk "year,state_fips,county_fips,precinct,invalid_column" precinct_results large_test.csv
end_time=$(date +%s)
duration=$((end_time - start_time))

[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false
# Should fail in less than 2 seconds (immediate validation)
# NOTE(review): second-granularity wall-clock check may be flaky on loaded CI
# runners — confirm the 2s budget is safe, or drop the timing assertion.
[ "$duration" -lt 2 ] || false
}

# When a --schema file is supplied, the primary keys come from the schema
# file, not --pk, so the early --pk-vs-header validation must not run.
@test "import-create-tables: primary key validation with schema file should skip validation" {
# Create a test CSV file
cat <<DELIM > test_data.csv
id,name,email
1,Alice,[email protected]
2,Bob,[email protected]
DELIM

# Create a schema file with different column as primary key
cat <<SQL > test_schema.sql
CREATE TABLE test_with_schema (
id INT,
name VARCHAR(100),
email VARCHAR(100),
PRIMARY KEY (name)
);
SQL

# When schema file is provided, it should work without primary key validation
run dolt table import -c --schema test_schema.sql test_with_schema test_data.csv
[ "$status" -eq 0 ]

# Verify that 'name' is the primary key (from schema file)
run dolt sql -q "SHOW CREATE TABLE test_with_schema;"
[ "$status" -eq 0 ]
[[ "$output" =~ "PRIMARY KEY (\`name\`)" ]] || false
}
1 change: 1 addition & 0 deletions integration-tests/bats/import-tables.bats
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ teardown() {
dolt table import -r -pk '`Key4' t `batshelper escaped-characters.csv`
dolt table import -r -pk "/Key5" t `batshelper escaped-characters.csv`
}

Loading