Python to Scrape Medium Articles with Full Source Code For Beginners

  • Medium is a website containing great articles and used by many programmers.
  • This script asks the user for the URL of a Medium article, scrapes its text, and saves it to a text file in a folder named scraped_articles in the same directory.
  • There are 3 text files in the folder scraped_articles as an example of how the article is scraped.


  • Run pip install -r requirements.txt to install required external modules.


  • beautifulsoup4
  • requests==2.23.0

Run the Script:


python3 scraping_medium.py

Source Code:

import os
import sys
import requests
import re
from bs4 import BeautifulSoup

# switching to current running python files directory

# function to get the html of the page
def get_page():
    """Prompt for an article URL, download the page and return it parsed.

    The entered URL is stored in the module-level ``url`` variable, which
    ``collect_text`` reads when it builds the header of the output text.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``html.parser`` backend.

    Raises:
        SystemExit: if the input does not start with ``http://`` or
            ``https://``.
        requests.HTTPError: if the server answers with an error status.
    """
    global url
    url = input('Enter url of a medium article: ').strip()
    # Abort instead of falling through to the request with an unusable URL.
    if not re.match(r'https?://', url):
        print('Please enter a valid website, or make sure it is a medium article')
        sys.exit(1)
    res = requests.get(url)
    # Fail loudly on 4xx/5xx rather than scraping an error page.
    res.raise_for_status()
    return BeautifulSoup(res.text, 'html.parser')

# function to remove all the html tags and replace some with specific strings
def purify(text):
    """Turn an HTML fragment into plain text.

    ``<br>``, ``<br/>`` and ``<li>`` are each replaced by a newline; every
    other tag is removed while the text between tags is kept.

    Args:
        text: HTML markup as a string.

    Returns:
        The input with the replacements applied and remaining tags stripped.
    """
    rep = {"<br>": "\n", "<br/>": "\n", "<li>": "\n"}
    # Escape each tag so it is matched literally, then look the matched text
    # up in the (unescaped) mapping.
    pattern = re.compile("|".join(re.escape(tag) for tag in rep))
    text = pattern.sub(lambda m: rep[m.group(0)], text)
    # Non-greedy so each tag is removed on its own; '.' does not match a
    # newline, so a tag that spans several lines is left as-is.
    return re.sub(r'<.*?>', '', text)

# function to compile all of the scraped text in one string
def collect_text(soup):
    """Assemble the scraped article into one plain-text string.

    The output consists of the source URL, the page title (upper-cased)
    followed by whatever comes after the first ``|`` in the ``<title>`` text,
    an ``INTRODUCTION`` section made of everything that precedes the second
    ``<h1>`` among its siblings, and then the second ``<h1>`` and all of its
    following siblings. Any sibling that is itself an ``<h1>`` is emitted as
    an upper-cased section heading.

    Reads the module-level ``url`` and sets the module-level ``title`` (used
    by ``save_file`` to name the output file).

    Args:
        soup: parsed page as returned by ``get_page``.

    Returns:
        The collected text.

    Raises:
        IndexError: if the page contains fewer than two ``<h1>`` elements.
    """
    global title
    # NOTE(review): presumably the <title> text looks like
    # "<article title> | <something>" -- confirm against real pages.
    head_parts = soup.head.title.text.split('|')
    title = head_parts[0].strip()
    # Tolerate a <title> without a '|' separator instead of raising.
    subtitle = head_parts[1].strip() if len(head_parts) > 1 else ''
    chunks = [f'url: {url}\n\n', f'Title: {title.upper()}\n{subtitle}']

    headers = soup.find_all('h1')
    if len(headers) < 2:
        raise IndexError('expected at least two <h1> elements in the article')
    # NOTE(review): the second <h1> is used as the anchor, presumably because
    # the first one is the article's own title -- confirm.
    anchor = headers[1]

    chunks.append('\n\nINTRODUCTION\n')
    # previous_siblings yields nearest-first; reverse to get document order.
    for elem in reversed(list(anchor.previous_siblings)):
        chunks.append(f'\n{purify(str(elem))}')

    chunks.append(f'\n\n{anchor.text.upper()}')
    for elem in anchor.next_siblings:
        # Plain text nodes may not expose a tag name, hence getattr.
        if getattr(elem, 'name', None) == 'h1':
            # Use the sibling's own text so each heading appears once and
            # no index into ``headers`` has to be kept in sync.
            chunks.append(f'\n\n{elem.text.upper()}')
            continue
        chunks.append(f'\n{purify(str(elem))}')
    return ''.join(chunks)

# function to save file in the current directory
def save_file(fin):
    """Write the collected text to ``./scraped_articles/<title>.txt``.

    The file name is the module-level ``title`` with runs of whitespace
    replaced by underscores and characters that are not allowed in file
    names on common platforms removed. The target folder is created when it
    does not exist yet.

    Args:
        fin: the text to write, as produced by ``collect_text``.
    """
    os.makedirs('./scraped_articles', exist_ok=True)
    stem = '_'.join(title.split())
    # Drop path separators and other reserved characters so a title such as
    # "A/B: why?" cannot point outside the folder or fail to open.
    stem = re.sub(r'[\\/:*?"<>|]', '', stem)
    fname = './scraped_articles/' + stem + '.txt'
    with open(fname, 'w', encoding='utf8') as outfile:
        outfile.write(fin)
    print(f'File saved in directory {fname}')

# driver code
if __name__ == '__main__':
    # Fetch the page, flatten it to text, then write it to disk.
    fin = collect_text(get_page())
    save_file(fin)

Leave a Comment