Scraping the course catalog of the IIT-Kharagpur Computer Science and Engineering department

Author - Rohan Manoj Thakkar

In [1]:
from bs4 import BeautifulSoup; #import beautiful soup scraper
In [2]:
import urllib2 #url library 
import pandas as pd; #dataframe for final output 
import re #regular expression package
import warnings
#Silence every warning category for the rest of the session so warning text does not clutter the cell outputs
warnings.filterwarnings('ignore')
In [3]:
#Download the curriculum page and parse it into a BeautifulSoup tree
url_new = 'http://cse.iitkgp.ac.in/oldlook/curriculum.html' #set url
page_new = urllib2.urlopen(url_new) #open the url
try:
    #No parser is named here, so BeautifulSoup chooses one of the installed parsers itself
    soup_new = BeautifulSoup(page_new.read()) #read in beautiful soup
finally:
    #Release the HTTP connection even if reading or parsing fails
    page_new.close()
In [4]:
#Retrieving all the 6 tables in the page
tables = list(soup_new.find_all('table'))

#All cells in the first table, except the 'semester' column cells (the ones marked valign="top"),
#which are useless for us. Each cell is tested on its own attribute: removing cells by their text
#with list.remove() drops the FIRST cell with that text, which may be a different cell and would
#shift every column that follows it.
org = [td.get_text() for td in tables[0].find_all('td') if td.get('valign') != 'top']

#empty lists to store id, name and prerequisites of each course throughout the page
course_id = []
course_name = []
prereq = []

#filling the 3 required columns: the remaining cells are read with a stride of 7,
#offset 2 = course id, offset 3 = course name, offset 6 = prerequisites
course_id.extend(org[2::7])
course_name.extend(org[3::7])
prereq.extend(org[6::7])
In [5]:
#All cells in the second table, except the 'semester' column cells (the ones marked valign="top"),
#which are useless for us. Each cell is tested on its own attribute: removing cells by their text
#with list.remove() drops the FIRST cell with that text, which may be a different cell and would
#shift every column that follows it.
org = [td.get_text() for td in tables[1].find_all('td') if td.get('valign') != 'top']

#filling the 3 required columns: the remaining cells are read with a stride of 7,
#offset 2 = course id, offset 3 = course name, offset 6 = prerequisites
course_id.extend(org[2::7])
course_name.extend(org[3::7])
prereq.extend(org[6::7])
In [6]:
#All cells in the third table, except the 'semester' column cells (the ones marked valign="top"),
#which are useless for us. Each cell is tested on its own attribute: removing cells by their text
#with list.remove() drops the FIRST cell with that text, which may be a different cell and would
#shift every column that follows it.
org = [td.get_text() for td in tables[2].find_all('td') if td.get('valign') != 'top']

#filling the 3 required columns: the remaining cells are read with a stride of 7,
#offset 2 = course id, offset 3 = course name, offset 6 = prerequisites
course_id.extend(org[2::7])
course_name.extend(org[3::7])
prereq.extend(org[6::7])
In [7]:
#All cells in the fifth table (skipping fourth table since it doesn't have the prerequisites column),
#except any cells marked valign="top" (the 'semester' column), which are useless for us.
#Each cell is tested on its own attribute: removing cells by their text with list.remove() drops the
#FIRST cell with that text, which may be a different cell and would shift every column that follows it.
org = [td.get_text() for td in tables[4].find_all('td') if td.get('valign') != 'top']

#filling the 3 required columns: the remaining cells are read with a stride of 5,
#offset 0 = course id, offset 1 = course name, offset 4 = prerequisites
course_id.extend(org[0::5])
course_name.extend(org[1::5])
prereq.extend(org[4::5])
In [8]:
#All cells in the sixth table, except any cells marked valign="top" (the 'semester' column),
#which are useless for us. Each cell is tested on its own attribute: removing cells by their text
#with list.remove() drops the FIRST cell with that text, which may be a different cell and would
#shift every column that follows it.
org = [td.get_text() for td in tables[5].find_all('td') if td.get('valign') != 'top']

#filling the 3 required columns: the remaining cells are read with a stride of 5,
#offset 0 = course id, offset 1 = course name, offset 4 = prerequisites
course_id.extend(org[0::5])
course_name.extend(org[1::5])
prereq.extend(org[4::5])
In [9]:
import time

#date when scraping occurs, formatted as day/month/year
current_date = time.strftime("%d/%m/%Y")

#Constant columns: one entry per scraped course.
#Every row comes from the CSE department of IIT-Kharagpur and shares the same scraping date.
n_courses = len(course_id)
date = [current_date] * n_courses
university = ['IIT-Kharagpur'] * n_courses
department = ['Computer Science and Engineering'] * n_courses

#new data-frame to store catalog; the column order is fixed up front, then each column is filled in turn
data = pd.DataFrame(columns=['date','university','department','code','name','prerequisites'])
for column, values in [('date', date),
                       ('university', university),
                       ('department', department),
                       ('code', course_id),
                       ('name', course_name),
                       ('prerequisites', prereq)]:
    data[column] = values

#Dropping the next-line characters from each prerequisites cell, which are of no use to us
data['prerequisites'] = data['prerequisites'].str.replace('\n','')
In [10]:
#Replacing the 'xa0' (non-breaking space) unicode characters, which are of no use to us.
#Series.replace only matches a WHOLE cell value, while Series.str.replace works on substrings.

#code: only cells that consist of a single NBSP are turned into the ' ' placeholder here;
#NBSPs embedded inside a course code are stripped later, after the UTF-8 encoding step
data.code = data.code.replace(u'\xa0', u' ')

#name / prerequisites: replace every NBSP, including those embedded in longer text
#(a cell that is only an NBSP still becomes ' ', exactly as before)
data.name = data.name.str.replace(u'\xa0', u' ')
data.prerequisites = data.prerequisites.str.replace(u'\xa0', u' ')

#since prerequisites are required to be separated by ';'
data.prerequisites = data.prerequisites.str.replace(',', ';')

#To standardize so that all courses with no prerequisites hold the same ' ' placeholder
#(whole-value match on purpose: only a cell that is exactly 'None' is blanked)
data.prerequisites = data.prerequisites.replace(u'None', u' ')
In [11]:
#Converting the three text columns from unicode to UTF-8 encoded strings

#An 'xa0' character encodes to the byte pair 'xc2' 'xa0'; that pair is dropped from the course codes
code = [cell.encode('utf-8').replace('\xc2\xa0','') for cell in data.code]
name = [cell.encode('utf-8') for cell in data.name]
pre = [cell.encode('utf-8') for cell in data.prerequisites]

#Replacing all 3 columns with string versions of columns
data['code'] = code
data['name'] = name
data['prerequisites'] = pre
In [12]:
#Calculating number of prerequisites for each course.
#Prerequisites are ';'-separated, so a non-blank cell holds (number of ';') + 1 entries.
#A cell counts as "no prerequisites" when it has no ';' and at most one character once
#surrounding whitespace is stripped. This covers the ' ' placeholder and any other
#single-character cell (both already counted as 0), plus empty or padded-blank cells,
#which were previously miscounted as 1 prerequisite.
no_of_preq = []
for text in data.prerequisites:
    separators = text.count(';')
    if separators == 0 and len(text.strip()) <= 1:
        no_of_preq.append(0)
    else:
        no_of_preq.append(separators + 1)

#Adding this as a new column to data-frame
data['number_of_prerequisites'] = no_of_preq
In [13]:
#Preview: the first 5 courses in the catalog (the frame itself is rendered as the cell output)
print('First five courses-')
data.head(n=5)
First five courses-
Out[13]:
date university department code name prerequisites number_of_prerequisites
0 11/04/2016 IIT-Kharagpur Computer Science and Engineering MA10001 Mathematics-I 0
1 11/04/2016 IIT-Kharagpur Computer Science and Engineering PH10001/CY10001 Physics/Chemistry 0
2 11/04/2016 IIT-Kharagpur Computer Science and Engineering ME10001/HS11001 Mechanics/English for Communication 0
3 11/04/2016 IIT-Kharagpur Computer Science and Engineering EE10001/CS11001 Electrical Technology/Programming and Data Str... 0
4 11/04/2016 IIT-Kharagpur Computer Science and Engineering EE19001/CS19001 Electrical Technology Lab/Programming and Data... 0
In [14]:
#Preview: the bottom 5 courses in the catalog (the frame itself is rendered as the cell output)
#The trailing comma keeps the Python 2 print statement from ending the line
print('Last five courses-'),
data.tail(n=5)
Last five courses-
Out[14]:
date university department code name prerequisites number_of_prerequisites
146 11/04/2016 IIT-Kharagpur Computer Science and Engineering CS60076 Advances in Digital and Mixed Signal Testing 0
147 11/04/2016 IIT-Kharagpur Computer Science and Engineering CS60078 Complex Networks CS11001/02; CS19001/02; CS21003; CS29003 4
148 11/04/2016 IIT-Kharagpur Computer Science and Engineering CS60080 Information Retrieval 0
149 11/04/2016 IIT-Kharagpur Computer Science and Engineering CS60082 Computational Number Theory 0
150 11/04/2016 IIT-Kharagpur Computer Science and Engineering CS60084 Foundations of Cryptography 0

In [15]:
#To plot histogram
import matplotlib.pyplot as plt
% matplotlib inline
# Spreading the plot to wider extent
fig = plt.figure(figsize=(15,5))

# Creating a subplot of matplotlib to incorporate required modifications
ax = fig.add_subplot(111)

# Hiding the right and top axes
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# Only show ticks on the left and bottom spines
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

#Histogram of number of prerequisites (excluding courses without course IDs because they are undetermined electives and 
#will, thus, naturally lack any predefined prerequisite)
ax.hist(data[data.code!=' '].number_of_prerequisites)

# Labeling the graph
plt.ylabel("Number of courses")
plt.xlabel("Number of prerequisities")
plt.title("Histogram of number of prerequisities for courses in IIT-Kharagpur")

#saving the resultant histogram as an image 
fig.savefig('Downloads\\Histogram_visualization.png')
#displaying the histogram below
fig.show()
In [16]:
#Exporting data-frame to a csv file
import os

#Output folder, relative to the working directory; created if it does not exist yet
out_dir = 'Downloads'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

#os.path.join picks the right separator for the platform
#(a hard-coded backslash is only a path separator on Windows)
data.to_csv(os.path.join(out_dir, 'IIT-Kharagpur_course_catalog.csv'))