1- import decodeHTMLChars from './decodeHTMLChars ' ;
1+ import { AllHtmlEntities } from 'html-entities ' ;
22
3- function parseMeta ( html , url ) {
4- const metaTagRegex = / < m e t a [ ^ > ] * p r o p e r t y = [ ' " ] * o g : ( [ ^ ' " ] * ) [ ^ > ] * c o n t e n t = [ ' " ] ( [ ^ ' " ] * ) [ ' " ] [ ^ > ] * > / gi;
5- const meta = {
6- url : url ,
7- } ;
8- const matches = html . match ( metaTagRegex ) ;
3+ const entities = new AllHtmlEntities ( ) ;
4+
5+ function parseMeta ( html , url , options ) {
6+ const metaTagOGRegex = / < m e t a [ ^ > ] * p r o p e r t y = [ ' " ] * o g : ( [ ^ ' " ] * ) [ ^ > ] * c o n t e n t = [ ' " ] ( [ ^ ' " ] * ) [ ' " ] [ ^ > ] * > / gi;
7+ const metaPropertyRegex = / < m e t a [ ^ > ] * p r o p e r t y = [ ' " ] * o g : ( [ ^ ' " ] * ) [ ^ > ] * > / i;
8+ const metaContentRegex = / < m e t a [ ^ > ] * c o n t e n t = [ ' " ] ( [ ^ ' " ] * ) [ ^ > ] * > / i;
9+ const meta = { url } ;
10+
11+ const matches = html . match ( metaTagOGRegex ) ;
912
1013 if ( matches ) {
1114 for ( let i = matches . length ; i -- ; ) {
12- let metaName = matches [ i ] . split ( 'og:' ) ;
15+ let metaName ;
16+ let metaValue ;
1317
14- if ( metaName . length > 1 ) {
15- metaName = metaName [ 1 ] . split ( '"' ) ;
16- } else {
17- break ;
18- }
18+ try {
19+ const propertyMatch = metaPropertyRegex . exec ( matches [ i ] ) ;
20+ const contentMatch = metaContentRegex . exec ( matches [ i ] ) ;
21+ metaName = propertyMatch [ 1 ] . trim ( ) ;
22+ metaValue = contentMatch [ 1 ] . trim ( ) ;
1923
20- if ( metaName . length > 1 ) {
21- metaName = metaName [ 0 ] ;
22- } else {
23- metaName = metaName [ 0 ] . split ( "'" ) ;
24-
25- if ( metaName . length > 1 ) {
26- metaName = metaName [ 0 ] ;
27- } else {
28- break ;
24+ if ( ! metaName || ! metaValue ) {
25+ continue ;
26+ }
27+ } catch ( error ) {
28+ if ( __DEV__ ) {
29+ console . log ( 'Error on ' , matches [ i ] ) ;
30+ console . log ( 'propertyMatch' , propertyMatch ) ;
31+ console . log ( 'contentMatch' , contentMatch ) ;
32+ console . log ( error ) ;
2933 }
30- }
3134
32- let metaValue = matches [ i ] . split ( 'content=' ) ;
35+ continue ;
36+ }
3337
34- if ( metaValue . length > 1 ) {
35- metaValue = metaValue [ 1 ] . split ( metaValue [ 1 ] . trim ( ) [ 0 ] ) [ 1 ] ;
38+ if ( metaValue . length > 0 ) {
3639 if ( metaValue [ 0 ] === '/' ) {
3740 if ( url [ url . length - 1 ] === '/' ) {
3841 metaValue = url + metaValue . substring ( 1 ) ;
@@ -41,10 +44,20 @@ function parseMeta(html, url) {
4144 }
4245 }
4346 } else {
44- break ;
47+ continue ;
4548 }
4649
47- meta [ metaName ] = decodeHTMLChars ( metaValue ) ;
50+ meta [ metaName ] = entities . decode ( metaValue ) ;
51+ }
52+
53+ if ( options . fallbackOnHTMLTags ) {
54+ try {
55+ fallbackOnHTMLTags ( html , meta ) ;
56+ } catch ( error ) {
57+ if ( __DEV__ ) {
58+ console . log ( 'Error in fallback' , error ) ;
59+ }
60+ }
4861 }
4962
5063 return meta ;
@@ -53,11 +66,46 @@ function parseMeta(html, url) {
5366 }
5467}
5568
/**
 * Reads the attributes of a single HTML tag into a lookup object.
 *
 * Attribute names are lower-cased; when an attribute is repeated the first
 * occurrence wins. Values may be double-quoted, single-quoted or unquoted;
 * an attribute without a value maps to the empty string.
 *
 * @param {string} tag - Full tag text, e.g. `<meta name="a" content="b">`.
 * @returns {Object<string, string>} Prototype-less map of attribute name to raw value.
 */
function parseTagAttributes(tag) {
  // Created per call because a /g regex keeps `lastIndex` state between execs.
  const attributeRegex = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
  // Prototype-less so attribute names such as "constructor" cannot collide.
  const attributes = Object.create(null);
  // Drop the leading `<tagname` and the trailing `>` / `/>` so only attributes remain.
  const attributeText = tag.replace(/^<\s*[^\s\/>]+/, '').replace(/\/?>$/, '');

  let match = attributeRegex.exec(attributeText);
  while (match !== null) {
    const name = match[1].toLowerCase();

    if (!(name in attributes)) {
      const value = [match[2], match[3], match[4]].find((candidate) => candidate !== undefined);
      attributes[name] = value === undefined ? '' : value;
    }

    match = attributeRegex.exec(attributeText);
  }

  return attributes;
}

/**
 * Finds the first `<meta name="...">` tag with the requested name and a
 * non-empty `content` attribute.
 *
 * Attribute order inside the tag does not matter, and quoted values may
 * contain `>` or the other kind of quote.
 *
 * @param {string} html - HTML document text to scan.
 * @param {string} wantedName - Lower-case value of the `name` attribute to look for.
 * @returns {?string} Trimmed content value, or `null` when none is found.
 */
function findMetaContentByName(html, wantedName) {
  // Quoted attribute values are consumed as a unit so a `>` inside them does not end the tag.
  const metaTagRegex = /<meta\b(?:[^>"']|"[^"]*"|'[^']*')*>/gi;

  let tagMatch = metaTagRegex.exec(html);
  while (tagMatch !== null) {
    const attributes = parseTagAttributes(tagMatch[0]);
    const name = typeof attributes.name === 'string' ? attributes.name.trim().toLowerCase() : '';

    if (name === wantedName && typeof attributes.content === 'string') {
      const content = attributes.content.trim();

      if (content) {
        return content;
      }
    }

    tagMatch = metaTagRegex.exec(html);
  }

  return null;
}

/**
 * Returns the trimmed text of the first `<title>` element, or `null` when the
 * document has no title or the title is blank. The opening tag may carry
 * attributes (e.g. `<title data-rh="true">`).
 *
 * @param {string} html - HTML document text to scan.
 * @returns {?string} Trimmed title text, or `null`.
 */
function findTitleText(html) {
  const titleMatch = /<title(?:\s[^>]*)?>([^<]*)<\/title\s*>/i.exec(html);

  if (!titleMatch) {
    return null;
  }

  return titleMatch[1].trim() || null;
}

/**
 * Fills in `description` and `title` on an already-collected metadata object
 * from plain HTML tags when those keys are missing or falsy.
 *
 * - `description` comes from `<meta name="description" content="...">`.
 * - `title` comes from the `<title>` element.
 *
 * Keys that already hold a truthy value are left untouched, and a key is only
 * written when a non-empty value was found. The object is mutated in place.
 *
 * NOTE(review): values are stored as they appear in the markup; HTML entities
 * are not decoded in this function.
 *
 * @param {string} htmlContent - Raw HTML of the page.
 * @param {Object} metaDataObject - Metadata object to complete (mutated).
 * @returns {void}
 */
function fallbackOnHTMLTags(htmlContent, metaDataObject) {
  // Nothing to scan or nothing to fill: leave the caller's data alone.
  if (typeof htmlContent !== 'string' || !metaDataObject) {
    return;
  }

  if (!metaDataObject.description) {
    const description = findMetaContentByName(htmlContent, 'description');

    if (description) {
      metaDataObject.description = description;
    }
  }

  if (!metaDataObject.title) {
    const title = findTitleText(htmlContent);

    if (title) {
      metaDataObject.title = title;
    }
  }
}
98+
5699async function fetchHtml ( urlToFetch ) {
57100 let result ;
58101
59102 try {
60- result = await fetch ( urlToFetch ) ;
103+ result = await fetch ( urlToFetch , {
104+ method : 'GET' ,
105+ headers : {
106+ "user-agent" : 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' ,
107+ } ,
108+ } ) ;
61109
62110 if ( result . status >= 400 ) {
63111 throw result ;
@@ -66,14 +114,20 @@ async function fetchHtml(urlToFetch) {
66114 return result . text ( )
67115 . then ( ( resultParsed ) => ( resultParsed ) ) ;
68116 } catch ( responseOrError ) {
69- if ( responseOrError . message ) {
70- console . log ( responseOrError ) ;
117+ if ( responseOrError . message && __DEV__ ) {
118+ if ( responseOrError . message === 'Network request failed' ) {
119+ console . log ( urlToFetch , 'could not be fetched' ) ;
120+ } else {
121+ console . log ( responseOrError ) ;
122+ }
71123 return null ;
72124 }
73125
74126 return responseOrError . text ( )
75127 . then ( ( error ) => {
76- console . log ( 'An error has occured while fetching url content' , error ) ;
128+ if ( __DEV__ ) {
129+ console . log ( 'An error has occured while fetching url content' , error ) ;
130+ }
77131 return null ;
78132 } ) ;
79133 }
@@ -92,14 +146,16 @@ function getUrls(contentToMatch) {
92146 urlsToReturn . push ( `http://${ url } ` ) ;
93147 }
94148 } ) ;
95-
96- return urlsToReturn ;
97149 } else {
98- throw new Error ( 'Could not find an html link' ) ;
150+ if ( __DEV__ ) {
151+ console . log ( 'Could not find an html link' ) ;
152+ }
99153 }
154+
155+ return urlsToReturn ;
100156}
101157
102- async function extractMeta ( textContent = '' ) {
158+ async function extractMeta ( textContent = '' , options = { fallbackOnHTMLTags : true } ) {
103159 try {
104160 const urls = getUrls ( textContent ) ;
105161
@@ -110,7 +166,7 @@ async function extractMeta(textContent = '') {
110166 metaData = await fetchHtml ( urls [ i ] )
111167 . then (
112168 ( html ) => ( {
113- ...parseMeta ( html , urls [ i ] ) ,
169+ ...html ? parseMeta ( html , urls [ i ] , options ) : { } ,
114170 url : urls [ i ] ,
115171 } )
116172 ) ;
0 commit comments