Python Script to Fetch Movie Information (with Full Source Code)

This script obtains movie details by scraping the IMDB website.

Prerequisites

  • beautifulsoup4
  • requests==2.23.0
  • Run the following command to install the required external modules:
pip install -r requirements.txt
Code language: Bash (bash)

Run the Script:

python3 movieInfoScraper.py
Code language: Bash (bash)
  • Type in the movie name when prompted.

Source Code:

movieInfoScraper.py

"""Fetch details of a movie by scraping the IMDB website."""

import sys

from bs4 import BeautifulSoup
import requests

# Base URL of the IMDB website
IMDB_URL = 'https://www.imdb.com'
# Seconds to wait for a response; without a timeout a stalled connection
# would block the script forever.
REQUEST_TIMEOUT = 10


def _fetch_soup(url, params=None):
    """Download *url* (with optional query *params*) and return the parsed HTML."""
    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'html.parser')


def _credit_names(credit_item):
    """Return the text of the links in a credit block whose href contains 'name'."""
    return [link.text for link in credit_item.findAll('a')
            if 'name' in link.get('href', '')]


def _as_text(value):
    """Join a list of strings with ', '; return a placeholder string unchanged."""
    if isinstance(value, str):
        return value
    return ', '.join(value)


# Function to get Movie Details
def getMovieDetails(movieName):
    """Look up *movieName* on IMDB and return its details.

    The first feature-film hit of an IMDB title search is used.

    Returns:
        ``None`` when the search page contains no result, otherwise a dict
        with the keys ``name``, ``year``, ``rating``, ``runtime``,
        ``release_date`` and ``plot`` (strings), ``genres``, ``directors``
        and ``cast`` (lists of strings) and ``writers`` (a list of strings,
        or the string ``'Not found'`` when no writer block is present).
        Fields missing from the page are filled with a placeholder string
        or an empty list instead of raising.

    Raises:
        requests.RequestException: if a page cannot be downloaded.
    """
    # Empty dictionary to store movie details
    movieDetails = {}

    # Let requests build the query string so that special characters in the
    # movie name are URL-encoded (spaces still become '+').
    bs = _fetch_soup(
        IMDB_URL + '/search/title',
        params={'title': movieName.strip(), 'title_type': 'feature'},
    )

    # Gets the first movie that appears in title section
    result = bs.find('h3', {'class': 'lister-item-header'})
    if result is None or result.a is None:
        return None
    href = result.a.get('href')
    if not href:
        return None

    # Drop any query string and make sure the path ends with '/', so that
    # the 'plotsummary' sub-page can be appended to it further down.
    movielink = IMDB_URL + href.split('?', 1)[0]
    if not movielink.endswith('/'):
        movielink += '/'
    movieDetails['name'] = result.a.text

    # Gets the page with movie details
    bs = _fetch_soup(movielink)

    # Year
    try:
        movieDetails['year'] = bs.find('span', {'id': 'titleYear'}).a.text
    except AttributeError:
        movieDetails['year'] = 'Not available'

    # Rating is looked up on its own so that a missing runtime cannot
    # discard a rating that is present.
    try:
        movieDetails['rating'] = bs.find(
            'div', {'class': 'ratingValue'}).span.text
    except AttributeError:
        movieDetails['rating'] = 'Not yet rated'

    # Genres, Runtime, Release Date all live in the 'subtext' block
    subtext = bs.find('div', {'class': 'subtext'})
    if subtext is None:
        movieDetails['genres'] = []
        movieDetails['runtime'] = 'Not available'
        movieDetails['release_date'] = 'Not available'
    else:
        # Genre links are the ones without a title attribute; the
        # release-date link carries one.
        movieDetails['genres'] = [
            link.text for link in subtext.findAll('a', {'title': None})]
        try:
            movieDetails['runtime'] = subtext.time.text.strip()
        except AttributeError:
            movieDetails['runtime'] = 'Not available'
        try:
            movieDetails['release_date'] = subtext.find(
                'a', {'title': 'See more release dates'}).text.strip()
        except AttributeError:
            movieDetails['release_date'] = 'Not available'

    # Gets the credit section of the page
    creditSummary = bs.findAll('div', {'class': 'credit_summary_item'})

    # Directors, Writers and Cast
    movieDetails['directors'] = (
        [link.text for link in creditSummary[0].findAll('a')]
        if creditSummary else [])
    if len(creditSummary) >= 3:
        movieDetails['writers'] = _credit_names(creditSummary[1])
        movieDetails['cast'] = _credit_names(creditSummary[2])
    elif len(creditSummary) == 2:
        # For some films, writer details are not provided; the second
        # block is then treated as the cast.
        movieDetails['writers'] = 'Not found'
        movieDetails['cast'] = _credit_names(creditSummary[1])
    else:
        movieDetails['writers'] = 'Not found'
        movieDetails['cast'] = []

    # The plot is a separate AJAX call and does not come in the html page,
    # so one more request to the plotsummary page is needed.
    bs = _fetch_soup(movielink + 'plotsummary')

    # Plot
    try:
        movieDetails['plot'] = bs.find(
            'li', {'class': 'ipl-zebra-list__item'}).p.text.strip()
    except AttributeError:
        movieDetails['plot'] = 'Not available'

    # Returns the dictionary with movie details
    return movieDetails


def main():
    """Prompt for a movie name and print the scraped details."""
    movieName = input('Enter the movie name whose details are to be fetched\n')
    try:
        movieDetails = getMovieDetails(movieName)
    except requests.RequestException as err:
        sys.exit('Could not fetch data from IMDB: {}'.format(err))

    if movieDetails is None:
        print('No movie of this name found !!!!!')
        return

    print('\n{movie} ({year})'.format(
        movie=movieDetails['name'], year=movieDetails['year']))
    print('Rating:', movieDetails['rating'])
    print('Runtime:', movieDetails['runtime'])
    print('Release Date:', movieDetails['release_date'])
    print('Genres:', _as_text(movieDetails['genres']))
    print('Director:', _as_text(movieDetails['directors']))
    # 'writers' may be the placeholder string 'Not found'; joining a plain
    # string would print it one character at a time.
    print('Writer:', _as_text(movieDetails['writers']))
    print('Cast:', _as_text(movieDetails['cast']))
    print('Plot Summary:\n', movieDetails['plot'])


if __name__ == "__main__":
    main()
Code language: Python (python)

Output:

Leave a Comment