@@ -160,6 +160,14 @@ class _HtmlFrameParser(object):
160160 attrs : dict
161161 List of HTML <table> element attributes to match.
162162
163+ encoding : str
164+ Encoding to be used by parser
165+
166+ displayed_only : bool
167+ Whether or not items with "display:none" should be ignored
168+
169+ .. versionadded:: 0.23.0
170+
163171 Attributes
164172 ----------
165173 io : str or file-like
@@ -172,6 +180,14 @@ class _HtmlFrameParser(object):
172180 A dictionary of valid table attributes to use to search for table
173181 elements.
174182
183+ encoding : str
184+ Encoding to be used by parser
185+
186+ displayed_only : bool
187+ Whether or not items with "display:none" should be ignored
188+
189+ .. versionadded:: 0.23.0
190+
175191 Notes
176192 -----
177193 To subclass this class effectively you must override the following methods:
@@ -187,11 +203,12 @@ class _HtmlFrameParser(object):
187203 functionality.
188204 """
189205
190- def __init__ (self , io , match , attrs , encoding ):
206+ def __init__ (self , io , match , attrs , encoding , displayed_only ):
191207 self .io = io
192208 self .match = match
193209 self .attrs = attrs
194210 self .encoding = encoding
211+ self .displayed_only = displayed_only
195212
196213 def parse_tables (self ):
197214 tables = self ._parse_tables (self ._build_doc (), self .match , self .attrs )
@@ -380,6 +397,27 @@ def _parse_raw_tbody(self, table):
380397 res = self ._parse_tr (table )
381398 return self ._parse_raw_data (res )
382399
400+ def _handle_hidden_tables (self , tbl_list , attr_name ):
401+ """Returns list of tables, potentially removing hidden elements
402+
403+ Parameters
404+ ----------
405+ tbl_list : list of Tag or list of Element
406+ Type of list elements will vary depending upon parser used
407+ attr_name : str
408+ Name of the accessor for retrieving HTML attributes
409+
410+ Returns
411+ -------
412+ list of Tag or list of Element
413+ Return type matches `tbl_list`
414+ """
415+ if not self .displayed_only :
416+ return tbl_list
417+
418+ return [x for x in tbl_list if "display:none" not in
419+ getattr (x , attr_name ).get ('style' , '' ).replace (" " , "" )]
420+
383421
384422class _BeautifulSoupHtml5LibFrameParser (_HtmlFrameParser ):
385423 """HTML to DataFrame parser that uses BeautifulSoup under the hood.
@@ -431,8 +469,14 @@ def _parse_tables(self, doc, match, attrs):
431469
432470 result = []
433471 unique_tables = set ()
472+ tables = self ._handle_hidden_tables (tables , "attrs" )
434473
435474 for table in tables :
475+ if self .displayed_only :
476+ for elem in table .find_all (
477+ style = re .compile (r"display:\s*none" )):
478+ elem .decompose ()
479+
436480 if (table not in unique_tables and
437481 table .find (text = match ) is not None ):
438482 result .append (table )
@@ -528,6 +572,17 @@ def _parse_tables(self, doc, match, kwargs):
528572
529573 tables = doc .xpath (xpath_expr , namespaces = _re_namespace )
530574
575+ tables = self ._handle_hidden_tables (tables , "attrib" )
576+ if self .displayed_only :
577+ for table in tables :
578+ # lxml utilizes XPATH 1.0 which does not have regex
579+ # support. As a result, we find all elements with a style
580+ # attribute and iterate them to check for display:none
581+ for elem in table .xpath ('.//*[@style]' ):
582+ if "display:none" in elem .attrib .get (
583+ "style" , "" ).replace (" " , "" ):
584+ elem .getparent ().remove (elem )
585+
531586 if not tables :
532587 raise ValueError ("No tables found matching regex {patt!r}"
533588 .format (patt = pattern ))
@@ -729,15 +784,15 @@ def _validate_flavor(flavor):
729784 return flavor
730785
731786
732- def _parse (flavor , io , match , attrs , encoding , ** kwargs ):
787+ def _parse (flavor , io , match , attrs , encoding , displayed_only , ** kwargs ):
733788 flavor = _validate_flavor (flavor )
734789 compiled_match = re .compile (match ) # you can pass a compiled regex here
735790
736791 # hack around python 3 deleting the exception variable
737792 retained = None
738793 for flav in flavor :
739794 parser = _parser_dispatch (flav )
740- p = parser (io , compiled_match , attrs , encoding )
795+ p = parser (io , compiled_match , attrs , encoding , displayed_only )
741796
742797 try :
743798 tables = p .parse_tables ()
@@ -773,7 +828,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
773828 skiprows = None , attrs = None , parse_dates = False ,
774829 tupleize_cols = None , thousands = ',' , encoding = None ,
775830 decimal = '.' , converters = None , na_values = None ,
776- keep_default_na = True ):
831+ keep_default_na = True , displayed_only = True ):
777832 r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
778833
779834 Parameters
@@ -877,6 +932,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
877932
878933 .. versionadded:: 0.19.0
879934
935+ displayed_only : bool, default True
936+ Whether elements with "display: none" should be ignored when parsing
937+
938+ .. versionadded:: 0.23.0
939+
880940 Returns
881941 -------
882942 dfs : list of DataFrames
@@ -924,4 +984,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
924984 parse_dates = parse_dates , tupleize_cols = tupleize_cols ,
925985 thousands = thousands , attrs = attrs , encoding = encoding ,
926986 decimal = decimal , converters = converters , na_values = na_values ,
927- keep_default_na = keep_default_na )
987+ keep_default_na = keep_default_na ,
988+ displayed_only = displayed_only )
0 commit comments