diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index 64baa69be4af31..afaf464dbcf420 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -295,6 +295,11 @@ The :mod:`csv` module defines the following classes: Twenty rows after the first row are sampled; if more than half of columns + rows meet the criteria, :const:`True` is returned. + Additionally, if all columns are found to contain only strings of varying + lengths, the average length of those strings is used to decide whether + the first row is a header. + + .. note:: This method is a rough heuristic and may produce both false positives and diff --git a/Lib/csv.py b/Lib/csv.py index 77f30c8d2b1f61..e58160569e91fa 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -394,6 +394,8 @@ def has_header(self, sample): # can't be determined, it is assumed to be a string in which case # the length of the string is the determining factor: if all of the # rows except for the first are the same length, it's a header. + # When the strings have varying length, the average length of all + # strings becomes a determining factor. # Finally, a 'vote' is taken at the end for each column, adding or # subtracting from the likelihood of the first row being a header. 
@@ -402,8 +404,9 @@ def has_header(self, sample): header = next(rdr) # assume first row is header columns = len(header) - columnTypes = {} - for i in range(columns): columnTypes[i] = None + columnTypes = {i: None for i in range(columns)} + average_size = 0 + col_are_strings = True checked = 0 for row in rdr: @@ -416,6 +419,10 @@ def has_header(self, sample): continue # skip rows that have irregular number of columns for col in list(columnTypes.keys()): + # check if all col are strings + if row[col].isnumeric(): + col_are_strings = False + thisType = complex try: thisType(row[col]) @@ -426,6 +433,7 @@ def has_header(self, sample): if thisType != columnTypes[col]: if columnTypes[col] is None: # add new column type columnTypes[col] = thisType + average_size += len(row[col]) else: # type is inconsistent, remove column from # consideration @@ -434,6 +442,13 @@ def has_header(self, sample): # finally, compare results against first row and "vote" # on whether it's a header hasHeader = 0 + + # special case when all columns are strings and columnTypes has been emptied + if not columnTypes and col_are_strings and columns > 0: + # If there are only columns of strings and no column types specified, + # update the dictionary to store the average length of all strings + columnTypes[0] = average_size // columns + for col, colType in columnTypes.items(): if isinstance(colType, int): # it's a length if len(header[col]) != colType: diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 6a4180e6d1b0a1..c238fdd4851449 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -1188,6 +1188,18 @@ class TestSniffer(unittest.TestCase): abc\0def ghijkl\0mno ghi\0jkl +""" + sample15 = """\ +sample,fastq_1,fastq_2 +A1-35-8,/mnt/scratch/sarek/data/A1-35-8/A1-35-8_R1.fastq.gz,/mnt/scratch/sarek/data/A1-35-8/A1-35-8_R2.fastq.gz +A2-102-5,/mnt/scratch/sarek/data/A2-102-5/A2-102-5_R1.fastq.gz,/mnt/scratch/sarek/data/A2-102-5/A2-102-5_R2.fastq.gz 
+A5-35-17,/mnt/scratch/sarek/data/A5-35-17/A5-35-17_R1.fastq.gz,/mnt/scratch/sarek/data/A5-35-17/A5-35-17_R2.fastq.gz +AD1-7a,/mnt/scratch/sarek/data/AD1-7a/AD1-7a_R1.fastq.gz,/mnt/scratch/sarek/data/AD1-7a/AD1-7a_R2.fastq.gz +AD1-83a,/mnt/scratch/sarek/data/AD1-83a/AD1-83a_R1.fastq.gz,/mnt/scratch/sarek/data/AD1-83a/AD1-83a_R2.fastq.gz +AD2-60a,/mnt/scratch/sarek/data/AD2-60a/AD2-60a_R1.fastq.gz,/mnt/scratch/sarek/data/AD2-60a/AD2-60a_R2.fastq.gz +Arg1366,/mnt/scratch/sarek/data/Arg1366/Arg1366_R1.fastq.gz,/mnt/scratch/sarek/data/Arg1366/Arg1366_R2.fastq.gz +Br795,/mnt/scratch/sarek/data/Br795/Br795_R1.fastq.gz,/mnt/scratch/sarek/data/Br795/Br795_R2.fastq.gz +Bt100,/mnt/scratch/sarek/data/Bt100/Bt100_R1.fastq.gz,/mnt/scratch/sarek/data/Bt100/Bt100_R2.fastq.gz """ def test_issue43625(self): @@ -1200,6 +1212,7 @@ def test_has_header_strings(self): sniffer = csv.Sniffer() self.assertFalse(sniffer.has_header(self.sample10)) self.assertFalse(sniffer.has_header(self.sample11)) + self.assertTrue(sniffer.has_header(self.sample15)) def test_has_header(self): sniffer = csv.Sniffer() diff --git a/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst new file mode 100644 index 00000000000000..88e89d9fcf943d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst @@ -0,0 +1 @@ +Fix false negatives in the :meth:`~csv.Sniffer.has_header` method of :class:`csv.Sniffer` when all cells in the first row are strings.