Scraping the course catalog of the IIT-Kharagpur Computer Science and Engineering department

Author - Rohan Manoj Thakkar

from bs4 import BeautifulSoup; #import beautiful soup scraper

import urllib2 #url library 
import pandas as pd; #dataframe for final output 
import re #regular expression package
import warnings
warnings.filterwarnings('ignore')

#Address of the curriculum page that is scraped
url_new = 'http://cse.iitkgp.ac.in/oldlook/curriculum.html'

#Download the page and hand its markup to Beautiful Soup for parsing
page_new = urllib2.urlopen(url_new)
page_markup = page_new.read()
soup_new = BeautifulSoup(page_markup)

#Collect every <table> element of the page into a plain list
#(the code further down indexes tables 0 to 5, so six tables are expected)
tables = list(soup_new.find_all('table'))

def _course_cells(table):
    """Return the text of every <td> in `table`, leaving out the semester cells.

    The semester cells are the ones carrying valign="top". Each cell is kept
    or skipped by looking at its own attribute instead of removing cells by
    matching text, so a semester label that shares its text with some other
    cell can never cause the wrong cell to be dropped (which would shift every
    following column).
    """
    return [cell.get_text() for cell in table.find_all('td')
            if cell.get('valign') != 'top']


#Layout of each table that is scraped, once the semester cells are gone:
#(index in `tables`, cells per row, position of course id, position of course
# name, position of prerequisites).
#The fourth table (index 3) is skipped since it doesn't have the prerequisites
#column.
_TABLE_LAYOUTS = [
    (0, 7, 2, 3, 6),
    (1, 7, 2, 3, 6),
    (2, 7, 2, 3, 6),
    (4, 5, 0, 1, 4),
    (5, 5, 0, 1, 4),
]

#lists to store id, name and prerequisites of each course throughout the page
course_id = []
course_name = []
prereq = []

#filling the 3 required columns, one table after the other
for table_index, row_width, id_pos, name_pos, prereq_pos in _TABLE_LAYOUTS:
    cells = _course_cells(tables[table_index])
    #cells[p::row_width] picks exactly the cells whose index i satisfies
    #i % row_width == p, i.e. one column of the flattened table
    course_id.extend(cells[id_pos::row_width])
    course_name.extend(cells[name_pos::row_width])
    prereq.extend(cells[prereq_pos::row_width])

import time

#Date on which the scraping takes place, formatted as day/month/year
current_date = time.strftime("%d/%m/%Y")

#Every course gets the same scraping date, university and department, since
#all of the data comes from the CSE department of IIT-Kharagpur
n_courses = len(course_id)
date = [current_date] * n_courses
university = ['IIT-Kharagpur'] * n_courses
department = ['Computer Science and Engineering'] * n_courses

#New data-frame holding the catalog; columns are filled in one at a time
catalog_columns = ['date','university','department','code','name','prerequisites']
catalog_values = [date, university, department, course_id, course_name, prereq]
data = pd.DataFrame(columns=catalog_columns)
for column_label, column_values in zip(catalog_columns, catalog_values):
    data[column_label] = column_values

#Strip the next-line characters out of every prerequisites cell
prereq_cells = data['prerequisites'].str.replace('\n','')

#Series.replace (as opposed to .str.replace) matches whole cell values only:
#a cell consisting of nothing but the non-breaking space u'\xa0' becomes a
#single ordinary space, while a u'\xa0' inside longer text is left in place
for column_label in ('code', 'name'):
    data[column_label] = data[column_label].replace(u'\xa0', u' ')
prereq_cells = prereq_cells.replace(u'\xa0', u' ')

#Prerequisites are required to be separated by ';' instead of ','
prereq_cells = prereq_cells.str.replace(',', ';')

#Standardize courses without prerequisites: a cell that reads exactly 'None'
#is turned into a single space as well
data['prerequisites'] = prereq_cells.replace(u'None', u' ')

#Converting the three text columns to UTF-8 encoded strings.
#For the course code, the byte pair '\xc2\xa0' (what a u'\xa0' character turns
#into once encoded) is dropped as well.
code = [entry.encode('utf-8').replace('\xc2\xa0','') for entry in data.code]
name = [entry.encode('utf-8') for entry in data.name]
pre = [entry.encode('utf-8') for entry in data.prerequisites]

#Replacing all 3 columns with their encoded versions
data['code'] = code
data['name'] = name
data['prerequisites'] = pre

def _count_prerequisites(cell_text):
    """Return the number of prerequisites listed in one ';'-separated cell.

    Only entries that still contain something after stripping whitespace are
    counted. A blank cell ('' or ' ') therefore yields 0, and a stray leading
    or trailing ';' does not inflate the count.
    """
    return sum(1 for entry in cell_text.split(';') if entry.strip())


#Calculating number of prerequisites for each course
no_of_preq = [_count_prerequisites(cell_text) for cell_text in data.prerequisites]

#Adding this as a new column to data-frame
data['number_of_prerequisites'] = no_of_preq

#Displaying the first 5 courses in the catalog:
#print a heading, then evaluate the first five rows of the data-frame.
#NOTE(review): the bare expression below produces visible output only when it
#is the last statement of an interactive (notebook) cell -- confirm how this
#file is meant to be run.
print 'First five courses-'
data.head(5)

First five courses-

#Displaying the bottom 5 courses in the catalog:
#print a heading (the trailing comma of the Python 2 print statement keeps the
#cursor on the same line), then evaluate the last five rows of the data-frame.
#NOTE(review): the bare expression below produces visible output only when it
#is the last statement of an interactive (notebook) cell -- confirm how this
#file is meant to be run.
print 'Last five courses-',
data.tail(5)

Last five courses-

#To plot histogram
import matplotlib.pyplot as plt
% matplotlib inline
# Spreading the plot to wider extent
fig = plt.figure(figsize=(15,5))

# Creating a subplot of matplotlib to incorporate required modifications
ax = fig.add_subplot(111)

# Hiding the right and top axes
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# Only show ticks on the left and bottom spines
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

#Histogram of number of prerequisites (excluding courses without course IDs because they are undetermined electives and 
#will, thus, naturally lack any predefined prerequisite)
ax.hist(data[data.code!=' '].number_of_prerequisites)

# Labeling the graph
plt.ylabel("Number of courses")
plt.xlabel("Number of prerequisities")
plt.title("Histogram of number of prerequisities for courses in IIT-Kharagpur")

#saving the resultant histogram as an image 
fig.savefig('Downloads\\Histogram_visualization.png')
#displaying the histogram below
fig.show()

#Exporting data-frame to a csv file.
#The target is a relative path written with a backslash separator, so the file
#lands in a 'Downloads' folder below the current working directory.
#NOTE(review): the backslash only acts as a path separator on Windows, and the
#folder presumably has to exist already -- confirm.
data.to_csv('Downloads\\IIT-Kharagpur_course_catalog.csv')

	date	university	department	code	name	prerequisites	number_of_prerequisites
146	11/04/2016	IIT-Kharagpur	Computer Science and Engineering	CS60076	Advances in Digital and Mixed Signal Testing		0
147	11/04/2016	IIT-Kharagpur	Computer Science and Engineering	CS60078	Complex Networks	CS11001/02; CS19001/02; CS21003; CS29003	4
148	11/04/2016	IIT-Kharagpur	Computer Science and Engineering	CS60080	Information Retrieval		0
149	11/04/2016	IIT-Kharagpur	Computer Science and Engineering	CS60082	Computational Number Theory		0
150	11/04/2016	IIT-Kharagpur	Computer Science and Engineering	CS60084	Foundations of Cryptography		0