from bs4 import BeautifulSoup; #import beautiful soup scraper
import urllib2 #url library
import pandas as pd; #dataframe for final output
import re #regular expression package
import warnings
warnings.filterwarnings('ignore')
# Address of the curriculum page that is scraped.
url_new = 'http://cse.iitkgp.ac.in/oldlook/curriculum.html'

# Download the page. The response is closed in every case (also when reading
# or parsing raises) so the underlying connection is not leaked.
page_new = urllib2.urlopen(url_new)
try:
    # NOTE(review): no parser is named, so BeautifulSoup picks one itself;
    # presumably the result can differ between machines with different
    # parsers installed - confirm and pin a parser if that matters.
    soup_new = BeautifulSoup(page_new.read())
finally:
    page_new.close()
def _course_columns(table, row_width, id_col, name_col, prereq_col):
    """Return the (ids, names, prerequisites) text lists of one table.

    The 'semester' cells (those carrying valign="top") are left out. The
    remaining cells are read as consecutive rows of ``row_width`` cells, and
    the cell at each of the three given positions is taken from every row.
    """
    # Drop the semester cells by their attribute instead of by their text:
    # removing by text deletes the first cell with an equal text, which may
    # be an unrelated cell and would shift every following row.
    cells = [td.get_text() for td in table.find_all('td')
             if td.get('valign') != 'top']
    return (cells[id_col::row_width],
            cells[name_col::row_width],
            cells[prereq_col::row_width])


# Retrieving all the tables in the page (six are expected)
tables = list(soup_new.find_all('table'))

# One entry per table that is read:
# (table index, cells per row, id position, name position, prerequisites position)
# The fourth table (index 3) is skipped since it doesn't have the
# prerequisites column.
_TABLE_LAYOUTS = [
    (0, 7, 2, 3, 6),
    (1, 7, 2, 3, 6),
    (2, 7, 2, 3, 6),
    (4, 5, 0, 1, 4),
    (5, 5, 0, 1, 4),
]

# id, name and prerequisites of each course throughout the page
course_id = []
course_name = []
prereq = []
for _index, _width, _id_col, _name_col, _prereq_col in _TABLE_LAYOUTS:
    _ids, _names, _prereqs = _course_columns(
        tables[_index], _width, _id_col, _name_col, _prereq_col)
    course_id.extend(_ids)
    course_name.extend(_names)
    prereq.extend(_prereqs)
import time

# Every course row gets the same three provenance values: the day the page
# was scraped (day/month/year), the university and the department.
current_date = time.strftime("%d/%m/%Y")
date = [current_date] * len(course_id)
# The data comes from the CSE department of IIT-Kharagpur.
university = ['IIT-Kharagpur'] * len(course_id)
department = ['Computer Science and Engineering'] * len(course_id)
# Assemble the catalog: one row per course, columns in the listed order.
data = pd.DataFrame(
    {
        'date': date,
        'university': university,
        'department': department,
        'code': course_id,
        'name': course_name,
        'prerequisites': prereq,
    },
    columns=['date', 'university', 'department', 'code', 'name', 'prerequisites']
)
# Series.replace (without .str) substitutes whole cell values only: a code or
# name cell that consists of just a non-breaking space becomes a single blank.
data['code'] = data['code'].replace(u'\xa0', u' ')
data['name'] = data['name'].replace(u'\xa0', u' ')

# Prerequisites, step by step:
#   1. delete every newline character inside the text,
#   2. turn a cell that is only a non-breaking space into a single blank,
#   3. use ';' instead of ',' as the separator between prerequisites,
#   4. turn a cell that reads exactly 'None' into a single blank, so that
#      every course without prerequisites holds the same value.
data['prerequisites'] = (
    data['prerequisites']
    .str.replace('\n', '')
    .replace(u'\xa0', u' ')
    .str.replace(',', ';')
    .replace(u'None', u' ')
)
# Encode the three text columns to UTF-8 strings. For the course codes the
# UTF-8 form of the non-breaking space (the two bytes 0xC2 0xA0) is deleted
# as well, because it would otherwise remain inside the code.
code = [cell.encode('utf-8').replace('\xc2\xa0', '') for cell in data.code]
name = [cell.encode('utf-8') for cell in data.name]
pre = [cell.encode('utf-8') for cell in data.prerequisites]

# Swap the encoded versions into the data-frame
data.code = code
data.name = name
data.prerequisites = pre
def _count_prerequisites(text):
    """Return how many prerequisites the ';'-separated ``text`` lists.

    A cell counts as "no prerequisites" (0) when it is empty, holds only
    whitespace, or is a single character without any ';'. Every other cell
    counts one prerequisite more than it has ';' separators.
    """
    separators = text.count(';')
    # The blank check covers empty and multi-blank cells, which were
    # previously counted as one prerequisite; the single-character rule is
    # the long-standing way a lone blank placeholder is recognised.
    if not text.strip() or (separators == 0 and len(text) == 1):
        return 0
    return separators + 1


# Number of prerequisites for each course, in row order
no_of_preq = [_count_prerequisites(text) for text in data.prerequisites]
# Adding this as a new column to data-frame
data['number_of_prerequisites'] = no_of_preq
# Show the first and the last five courses of the catalog. The frames are
# printed explicitly: a bare `data.head(5)` expression is only displayed by an
# interactive cell and shows nothing when the file runs as a script.
print('First five courses-')
print(data.head(5))
print('Last five courses-')
print(data.tail(5))
# To plot histogram
import matplotlib.pyplot as plt

# Inside IPython/Jupyter, render figures inline. The `%` magic syntax is not
# valid Python, so the magic is invoked through the shell object; in a plain
# interpreter `get_ipython` does not exist and the step is skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Spreading the plot to wider extent
fig = plt.figure(figsize=(15, 5))
# Creating a subplot of matplotlib to incorporate required modifications
ax = fig.add_subplot(111)
# Hiding the right and top axes
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Only show ticks on the left and bottom spines
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
# Histogram of number of prerequisites. Rows whose course code is a single
# blank are left out: they are undetermined electives and will, thus,
# naturally lack any predefined prerequisite.
ax.hist(data[data.code != ' '].number_of_prerequisites)
# Labeling the graph
plt.ylabel("Number of courses")
plt.xlabel("Number of prerequisites")
plt.title("Histogram of number of prerequisites for courses in IIT-Kharagpur")
# Saving the resultant histogram as an image (path is relative to the
# current working directory and uses a Windows-style separator)
fig.savefig('Downloads\\Histogram_visualization.png')
# Displaying the histogram
fig.show()
# Write the complete catalog, including the prerequisite counts, to a CSV
# file (path is relative to the current working directory).
data.to_csv(path_or_buf='Downloads\\IIT-Kharagpur_course_catalog.csv')