1- from requests_html import HTMLSession
2- from bs4 import BeautifulSoup as bs
1+ import requests
2+ from bs4 import BeautifulSoup
33import re
44import json
5-
6- # init session
7- session = HTMLSession ()
8-
5+ import argparse
96
def get_video_info(url):
    """
    Extract video information from a YouTube watch page.

    Fetches the page HTML with ``requests``, parses the embedded
    ``ytInitialData`` JSON blob, and returns a dict with the keys:
    ``title``, ``views``, ``date_published``, ``description``,
    ``duration``, ``tags``, ``likes``, ``dislikes`` and, when the
    owner section is present, ``channel`` ({'name', 'url', 'subscribers'}).

    Raises RuntimeError (a subclass of Exception, so existing callers
    catching Exception still work) if the page cannot be fetched or the
    expected data structures are missing.
    """
    headers = {
        # A desktop browser UA makes YouTube serve the full HTML page
        # that embeds ytInitialData.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Download HTML code. A timeout is essential: without one,
        # requests can block forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        page = response.text

        result = {}

        # ytInitialData holds all the structured video metadata.
        data_match = re.search(r'var ytInitialData = ({.*?});', page)
        if not data_match:
            raise RuntimeError("Could not find ytInitialData in page")
        data_json = json.loads(data_match.group(1))

        contents = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents']

        # Locate the renderers by key instead of assuming fixed indices
        # (contents[0]/contents[1]) -- YouTube occasionally reorders or
        # inserts sections, and a short list would raise IndexError.
        primary = None
        secondary = None
        for section in contents:
            if 'videoPrimaryInfoRenderer' in section:
                primary = section['videoPrimaryInfoRenderer']
            elif 'videoSecondaryInfoRenderer' in section:
                secondary = section['videoSecondaryInfoRenderer']

        # Video title, view count and publication date.
        if primary is not None:
            result["title"] = primary['title']['runs'][0]['text']
            result["views"] = primary['viewCount']['videoViewCountRenderer']['viewCount']['simpleText']
            result["date_published"] = primary['dateText']['simpleText']

        # Channel details from the owner renderer.
        if secondary is not None:
            owner = secondary['owner']['videoOwnerRenderer']
            channel_name = owner['title']['runs'][0]['text']
            channel_id = owner['navigationEndpoint']['browseEndpoint']['browseId']
            # Proper /channel/<id> path.
            channel_url = f"https://www.youtube.com/channel/{channel_id}"
            channel_subscribers = owner['subscriberCountText']['accessibility']['accessibilityData']['label']
            result['channel'] = {
                'name': channel_name,
                'url': channel_url,
                'subscribers': channel_subscribers,
            }

        # Video description.
        if secondary and 'attributedDescription' in secondary:
            result["description"] = secondary['attributedDescription']['content']
        else:
            result["description"] = "Description not available"

        # Duration from the player response. Format as H:MM:SS for long
        # videos and M:SS otherwise (the old code printed e.g. "90:05"
        # for a 1.5-hour video).
        duration_match = re.search(r'"approxDurationMs":"(\d+)"', page)
        if duration_match:
            total_seconds = int(duration_match.group(1)) // 1000
            hours, rem = divmod(total_seconds, 3600)
            minutes, seconds = divmod(rem, 60)
            if hours:
                result["duration"] = f"{hours}:{minutes:02d}:{seconds:02d}"
            else:
                result["duration"] = f"{minutes}:{seconds:02d}"
        else:
            result["duration"] = "Duration not available"

        # Video tags, when the metadata renderer exposes them.
        video_tags = data_json.get('metadata', {}).get('videoMetadataRenderer', {}).get('keywords', [])
        result["tags"] = ', '.join(video_tags) if video_tags else "No tags available"

        # Likes. YouTube no longer publishes dislikes at all.
        result["likes"] = "Likes count not available"
        result["dislikes"] = "UNKNOWN"

        # Flattened lookup (was a 7-deep nested-if ladder): walk the
        # optional path with .get() and keep the last matching label.
        for section in contents:
            label = (section.get('compositeVideoPrimaryInfoRenderer', {})
                            .get('likeButton', {})
                            .get('toggleButtonRenderer', {})
                            .get('defaultText', {})
                            .get('accessibility', {})
                            .get('accessibilityData', {})
                            .get('label'))
            if label and 'like' in label.lower():
                result["likes"] = label

        return result

    except requests.RequestException as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Error extracting video info: {e}") from e
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # Structure of ytInitialData changed or JSON failed to parse.
        raise RuntimeError(f"Error extracting video info: {e}") from e
69122
if __name__ == "__main__":
    # Command-line entry point: extract and print metadata for one video.
    parser = argparse.ArgumentParser(description="YouTube Video Data Extractor")
    parser.add_argument("url", help="URL of the YouTube video")
    args = parser.parse_args()

    # parse the video URL from command line
    url = args.url

    try:
        data = get_video_info(url)

        # print in nice format
        print(f"Title: {data['title']}")
        print(f"Views: {data['views']}")
        print(f"Published at: {data['date_published']}")
        print(f"Video Duration: {data['duration']}")
        print(f"Video tags: {data['tags']}")
        print(f"Likes: {data['likes']}")
        print(f"Dislikes: {data['dislikes']}")
        print(f"\nDescription: {data['description']}\n")
        print(f"\nChannel Name: {data['channel']['name']}")
        print(f"Channel URL: {data['channel']['url']}")
        print(f"Channel Subscribers: {data['channel']['subscribers']}")

    except Exception as e:
        print(f"Error: {e}")
        print("\nNote: YouTube frequently changes its structure, so this script may need updates.")
        # Exit non-zero so shell scripts / CI can detect the failure
        # (previously the script exited 0 even on error).
        raise SystemExit(1)
0 commit comments