I am new to python scrapping. I have taken this code quite long ago from this website. I am trying to change some stuff so that i can my self crawl links from google and then index them and stored them. But problem is that i am facing strange error i dont know why this is occuring.
Kindly help. My code (i am also placing link of file: http://www.uploadmb.com/dw.php?id=1455203992)
#!/usr/bin/env python
try:
import psyco
psyco.full()
except:
pass
from pylab import*
from scipy import stats
from copy import deepcopy
import urllib
from urllib2 import unquote
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin
import sqlite3 as sqlite
stem = False
if stem: import pracstem
# There are two lines,110 and 138, in this program that make use of this escape character as the condition. Uncomment both of them.
# Add more unwanted characters in url.
#escapeChar=['add some characters here']
# Create a list of words to ignore
ignorewords = file('stopwords').read().split('\n')
ignorewords = set([word for word in ignorewords if len(word) > 2])
iter = 20
### CLASS CRAWLER STARTS HERE###
class crawler (object):
# Initialize the crawler with the name of database
def __init__(self, dbname):
self.con = sqlite.connect(dbname)
def __del__(self):
self.con.close()
def dbcommit(self):
self.con.commit()
def dynamicpage(self, page):
if '?' or '%' in page: return True
else: return False
def contentlength(self, soup):
content = ''
for paragraph in soup('p'):
for node in paragraph.contents:
if type(node)!=None:
content += str(node.string.encode('utf-8'))
content += ' '
return len(content)
# Auxilliary function for getting an entry id and adding
# it if it's not present
def getentryid(self, table, field, value, createnew = True):
cur = self.con.execute ("select rowid from %s where %s = '%s'" %(table, field, value))
res = cur.fetchone()
if res is None:
cur = self.con.execute("insert into %s (%s) values('%s')" %(table, field, value))
return cur.lastrowid
else:
return res[0]
# Index an individual page
def addtoindex(self, url, soup):
if self.isindexed (url): return
print 'Indexing ' + url
# Get the individual words
text = self.gettextonly(soup)
words = self.separatewords(text)
if stem: words = pracstem.stem(words)
# Get the URL id
urlid = self.getentryid('urllist', 'url', url)
# Link each word to this url
for i in range(len(words)):
word = words[i]
if word in ignorewords: continue
wordid = self.getentryid('wordlist', 'word', word)
self.con.execute("insert into wordlocation(urlid, wordid, location) values(%i, %i, %i)" %(urlid, wordid, i))
# This function will only index title and paragraph content of the page
def addtoindexnew(self, url, soup):
if self.isindexed (url): return
print 'Indexing ' + url
#Get only contents of each paragraph
content = ''
for paragraph in soup('p'):
for node in paragraph.contents:
if type(node)!=None:
content += str(node.string).encode('utf-8')
content += ' '
#Get also title of the page
title = soup('title')[0].string
#title + content
text = title + ' ' + content
# Get the individual words
words = self.separatewords(text)
if stem: words = pracstem.stem(words)
# Get the URL id
urlid = self.getentryid('urllist', 'url', url)
# Link each word to this url
for i in range(len(words)):
word = words[i]
if word in ignorewords: continue
wordid = self.getentryid('wordlist', 'word', word)
self.con.execute("insert into wordlocation(urlid, wordid, location) values(%i, %i, %i)" %(urlid, wordid, i))
# Extract the text from an HTML page (no tags)
def gettextonly(self, soup):
v = soup.string
if v is None:
c = soup.contents
resulttext = ''
for t in c:
subtext = self.gettextonly(t)
resulttext += subtext + '\n'
return resulttext
else: return v.strip()
# Separate the words by any non-whitespace character
def separatewords(self, text):
splitter = re.compile('\\W*')
return [s.lower() for s in splitter.split(text) if len(s)>2 and len(s)<20]
# Return true if this url is already indexed
def isindexed(self, url):
u = self.con.execute("select rowid from urllist where url = '%s'" %url).fetchone()
if u is not None:
# Check if it has actually been crawled
v = self.con.execute('select * from wordlocation where urlid = %i' %u[0]).fetchone ()
if v is not None: return True
return False
# Add a link between two pages
def addlinkref(self, urlFrom, urlTo, linkText):
words = self.separatewords(linkText)
if stem: words = pracstem.stem(words)
fromid = self.getentryid('urllist', 'url', urlFrom)
toid = self.getentryid('urllist', 'url', urlTo)
if fromid == toid: return
cur = self.con.execute("insert into link(fromid, toid) values(%i, %i)" %(fromid, toid))
linkid = cur.lastrowid
for word in words:
if word in ignorewords: continue
wordid = self.getentryid('wordlist', 'word', word)
self.con.execute("insert into linkwords(linkid, wordid) values(%i, %i)" %(linkid, wordid))
# Starting with a list of pages, do a breadth
# First search to the given depth, indexing pages as we go
def crawl(self, pages, depth = 2):
d = requests.get("https://www.google.dz/search?q="+pages)
for i in range(depth):
newpages = set()
# if (escapeChar[i] in page for i in range(len(escapeChar))): continue #Uncomment to activate this condition
#try:
#urllib2.urlopen(page)
#except:
# print "Could not open %s" %page
# continue
# try:
# except:
# print "Couldn't Beauticate %s" %page
# continue
# See if the page content long enough
# if self.contentlength(soup) < 500:
# continue
# Find the hyperlinks and index its anchor text
self.createindextables()
soup = BeautifulSoup(d.content)
#try:
self.addtoindexnew(page, soup) #Index the content of the page
#except:
# print "Could't index this page"
# continue
url = soup.findAll("a")
for link in url:
if link['href'].startswith('/url?q=') \
and 'webcache.googleusercontent.com' not in link['href']:
print link['href'].split('/url?q=')[1].split('&')[0]
# if (escapeChar[i] in url for i in range(len(escapeChar))): continue #Uncomment to activate this condition
# If both page and hyperlink on it are dynamic page, do not add the link to the queue
if self.dynamicpage(page) and self.dynamicpage(url): continue
if url[0:4] == 'http' and not self.isindexed(url): newpages.add(url)
linkText = self.gettextonly(link)
self.addlinkref(page, url, linkText)
try:
self.addtoindexnew(page, soup) #Index the content of the page
except:
print "Could't index this page"
continue
self.dbcommit()
pages = newpages
def createindextables(self):
self.con.execute('create table urllist(url text)')
self.con.execute('create table wordlist(word text)')
self.con.execute('create table wordlocation(urlid integer, wordid integer, location integer)')
self.con.execute('create table link(fromid integer, toid integer)')
self.con.execute('create table linkwords(linkid integer, wordid integer)')
self.con.execute('create index wordidx on wordlist(word)')
self.con.execute('create index urlidx on urllist(url)')
self.con.execute('create index wordurlidx on wordlocation(wordid)')
self.con.execute('create index urltoidx on link(toid)')
self.con.execute('create index urlfromidx on link(fromid)')
self.con.execute('create index linkwordidx on linkwords(wordid)')
self.con.execute('create index linkidx on linkwords(linkid)')
self.dbcommit()
def calculatelength(self):
self.con.execute("drop table if exists pagelength")
self.con.execute("create table pagelength(urlid integer primary key, length integer)")
self.con.execute("insert into pagelength select rowid, 0 from urllist")
self.dbcommit()
for (urlid,) in self.con.execute('select rowid from urllist'):
loc = self.con.execute("select location from wordlocation where urlid=%i" %urlid)
locs = [id for id in loc]
if len(locs) == 0: continue
length = locs[len(locs)-1][0]
self.con.execute("update pagelength set length = %i where urlid = %i" %(length, urlid))
self.dbcommit()
def calculatepagerank(self, iter):
# Clear out the current pagerank tables
self.con.execute('drop table if exists pagerank')
# Create pagerank table
self.con.execute('create table pagerank(urlid integer primary key, score float)')
# Clear out the current prconvergence table
self.con.execute('drop table if exists prconvergence')
# Create prconvergence table
self.con.execute('create table prconvergence(residu float)')
# Calculate number of pages in the database and create array that store current ranking vector
n = self.con.execute('select count(*) from urllist').fetchone()[0]
# Initialize every url with a PageRank of PR0
pr0 = (1.0/n)
self.con.execute('insert into pagerank select rowid, %f from urllist' %pr0)
self.dbcommit()
# create array that store current scores
currentscores = zeros(n)
# Start the iterations
for i in range(iter):
prevscores = deepcopy(currentscores)
print "Iteration %i" %(i)
for (urlid,) in self.con.execute('select rowid from urllist'):
pr = 0.15/n
# Loop through all the pages that link to this one
for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
# Get the PageRank of the linker
linkingpr = self.con.execute('select score from pagerank where urlid = %i' %linker).fetchone()[0]
# Get the total number of links from the linker
linkingcount = self.con.execute('select count(*) from link where fromid = %i' %linker).fetchone()[0]
pr += 0.85*linkingpr/linkingcount
currentscores[urlid-1] = pr
self.con.execute('update pagerank set score = %f where urlid = %i' %(pr, urlid))
residu = sum(abs(currentscores-prevscores))
self.con.execute('insert into prconvergence(residu) values(%f)'%residu)
self.dbcommit()
def calculatepagerank2(self, iter):
# Clear out the current pagerank tables
self.con.execute('drop table if exists pagerank')
# Create pagerank table
self.con.execute('create table pagerank(urlid integer primary key, score float)')
# Clear out the current prconvergence table
self.con.execute('drop table if exists prconvergence')
# Create prconvergence table
self.con.execute('create table prconvergence(residu float)')
# Calculate number of pages in the database and create array that store current ranking vector
n = self.con.execute('select count(*) from urllist').fetchone()[0]
# Initialize every url with a PageRank of PR0
pr0 = 1.0/n
self.con.execute('insert into pagerank select rowid, %f from urllist' %pr0)
self.dbcommit()
# create array that store current scores
currentscores = zeros(n)
# Start the iterations
for i in range(iter):
prevscores = deepcopy(currentscores)
dscore = 0.0
print "Iteration %i" %(i)
# Score from dangling nodes
for (urlid,) in self.con.execute('select rowid from urllist'):
if self.con.execute('select val from dnode where id = %i' %urlid).fetchone() == 1:
dscore += self.con.execute('select score from pagerank where urlid = %i' %urlid).fetchone()
for (urlid,) in self.con.execute('select rowid from urllist'):
pr = (0.15 + 0.85*dscore)/n
# Loop through all the pages that link to this one
for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
# Get the PageRank of the linker
linkingpr = self.con.execute('select score from pagerank where urlid = %i' %linker).fetchone()[0]
# Get the total number of links from the linker
linkingcount = self.con.execute('select count(*) from link where fromid = %i' %linker).fetchone()[0]
pr += 0.85*linkingpr/linkingcount
currentscores[urlid-1] = pr
self.con.execute('update pagerank set score = %f where urlid = %i' %(pr, urlid))
residu = sum(abs(currentscores-prevscores))
self.con.execute('insert into prconvergence(residu) values(%f)'%residu)
self.dbcommit()
def calculatemypagerank(self, iter):
# Clear out the current mypagerank table
self.con.execute('drop table if exists mypagerank')
# Create mypagerank table
self.con.execute('create table mypagerank(urlid integer primary key, score float, cp float)')
# Clear out the current myprconvergence table
self.con.execute('drop table if exists myprconvergence')
# Create myprconvergence table
self.con.execute('create table myprconvergence(residu float)')
# Calculate number of pages in the database and create array that store current ranking vector
n = self.con.execute('select count(*) from urllist').fetchone()[0]
mpr0 = 1.0/n
# Initialize every url with a mypagerank and cp
for (urlid,) in self.con.execute('select rowid from urllist'):
self.con.execute("insert into mypagerank(urlid, score, cp) values (%i, %f, %f)" %(urlid, mpr0, 1.0))
self.dbcommit()
# Find number of inlinks and outlinks of each url
for (urlid,) in self.con.execute('select rowid from urllist'):
inlinks = self.con.execute('select count (*) from link where toid = %i' %urlid).fetchone()[0]
outlinks = self.con.execute('select count (*) from link where fromid = %i' %urlid).fetchone()[0]
if outlinks == 0: outlinks = 0.1
if inlinks > outlinks: p = 1
elif inlinks < outlinks: p = -1
else: p = 0
# Calculate cp
cp = abs (inlinks - outlinks)**p
cp *= (inlinks + outlinks)*1/float(outlinks)
self.con.execute('update mypagerank set cp = %f where urlid = %i' %(cp, urlid))
self.dbcommit()
# create array that store current ranking vector
currentscores = zeros(n)
#Start the score calculation
for i in range(iter):
prevscores = deepcopy(currentscores)
print "Iteration %i" %(i)
for (urlid,) in self.con.execute('select rowid from urllist'):
score = 0.15/n
# Loop through all the pages that link to this one
for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
# Get linker score from mypagerank
linkingscore = self.con.execute('select score from mypagerank where urlid = %i' %linker).fetchone()[0]
# Get linker cp from mypagerank
linker_cp = self.con.execute('select cp from mypagerank where urlid = %i' %linker).fetchone()[0]
score += 0.85*linkingscore*linker_cp
currentscores[urlid-1] = score
self.con.execute('update mypagerank set score = %f where urlid = %i' %(score, urlid))
residu = sum(abs(currentscores-prevscores))
self.con.execute('insert into myprconvergence(residu) values(%f)'%residu)
self.dbcommit()
def calculatehits (self, iter):
# Clear out current auth_hits tables if already existed
self.con.execute('drop table if exists auth_hits')
# Create auth_hits tables
self.con.execute('create table auth_hits(urlid integer primary key, score float)')
# Clear out current hub_hits tables if already existed
self.con.execute('drop table if exists hub_hits')
# Create hub_hits tables
self.con.execute('create table hub_hits(urlid integer primary key, score float)')
# Clear out current auth_hitsconvergence tables if already existed
self.con.execute('drop table if exists auth_hitsconvergence')
# Create auth_hitsconvergence tables
self.con.execute('create table auth_hitsconvergence(residu float)')
# Clear out current hub_hitsconvergence tables if already existed
self.con.execute('drop table if exists hub_hitsconvergence')
# Create hub_hitsconvergence tables
self.con.execute('create table hub_hitsconvergence(residu float)')
# calculate number of pages in database
n = self.con.execute('select count(*) from urllist').fetchone()[0]
# initial value
intval = 1.0/n
# Initialize every url with a authority and hub scores
self.con.execute('insert into auth_hits select rowid, %f from urllist' %intval)
self.con.execute('insert into hub_hits select rowid, %f from urllist' %intval)
self.dbcommit()
auth_currentscores = zeros(n)
hub_currentscores = zeros(n)
# Start the iterations
for i in range(iter):
agg_authScore = 0.0
agg_hubScore = 0.0
print "Iteration %i" %(i)
# Authority part
auth_prevscores = deepcopy(auth_currentscores)
for (urlid,) in self.con.execute ('select rowid from urllist'):
authScore = 0.0
# Loop through all the pages that link to this one
for (linkerHub,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
# Get the hub scores of the linker
linker_hubScore = self.con.execute('select score from hub_hits where urlid = %i' %linkerHub).fetchone()[0]
authScore += linker_hubScore
self.con.execute('update auth_hits set score = %f where urlid = %i' %(authScore, urlid))
agg_authScore += authScore
for (urlid,) in self.con.execute('select rowid from urllist'):
NauthScore = self.con.execute('select score from auth_hits where urlid = %i' %urlid).fetchone()[0]
normalized_authScore = NauthScore*1/agg_authScore
auth_currentscores[urlid-1] = normalized_authScore
self.con.execute('update auth_hits set score = %f where urlid = %i' %(normalized_authScore, urlid))
self.dbcommit()
# Hub part
hub_prevscores = deepcopy(hub_currentscores)
for (urlid,) in self.con.execute('select rowid from urllist'):
hubScore = 0.0
# Loop through all the pages that being linked by this one
for (linkerAuth,) in self.con.execute('select distinct toid from link where fromid = %i' %urlid):
# Get the authority scores of the linker
linker_authScore = self.con.execute('select score from auth_hits where urlid = %i' %linkerAuth).fetchone()[0]
hubScore += linker_authScore
self.con.execute('update hub_hits set score = %f where urlid = %i' %(hubScore, urlid))
agg_hubScore += hubScore
for (urlid,) in self.con.execute('select rowid from urllist'):
NhubScore = self.con.execute('select score from hub_hits where urlid = %i' %urlid).fetchone()[0]
normalized_hubScore = NhubScore*1/agg_hubScore
hub_currentscores[urlid-1] = normalized_hubScore
self.con.execute ('update hub_hits set score = %f where urlid = %i' %(normalized_hubScore, urlid))
auth_res = sum(abs(auth_currentscores-auth_prevscores))
self.con.execute('insert into auth_hitsconvergence(residu) values(%f)'%auth_res)
hub_res = sum(abs(hub_currentscores-hub_prevscores))
self.con.execute('insert into hub_hitsconvergence(residu) values(%f)'%hub_res)
self.dbcommit()
def calculatemyhits(self, iter):
# Clear out current auth_myhits table if have already existed
self.con.execute('drop table if exists auth_myhits')
# Create auth_myhits table
self.con.execute('create table auth_myhits(urlid integer primary key, score float, ca float)')
# Clear out current hub_myhits table if have already existed
self.con.execute('drop table if exists hub_myhits')
# Create hub_myhits table
self.con.execute('create table hub_myhits(urlid integer primary key, score float, ch float)')
# Clear out current auth_myhitsconvergence tables if already existed
self.con.execute('drop table if exists auth_myhitsconvergence')
# Create auth_myhitsconvergence tables
self.con.execute('create table auth_myhitsconvergence(residu float)')
# Clear out current hub_myhitsconvergence tables if already existed
self.con.execute('drop table if exists hub_myhitsconvergence')
# Create hub_myhitsconvergence tables
self.con.execute('create table hub_myhitsconvergence(residu float)')
# calculate number of pages in database
n = self.con.execute('select count(*) from urllist').fetchone()[0]
# initial value
intval = 1.0/n
# Initialize every url with a authority and hub scores
for (urlid,) in self.con.execute('select rowid from urllist'):
self.con.execute("insert into auth_myhits(urlid, score, ca) values(%i, %f, %f)" %(urlid, intval, 1.0))
self.con.execute("insert into hub_myhits(urlid, score, ch) values(%i, %f, %f)" %(urlid, intval, 1.0))
self.dbcommit()
# Find number of inlinks and outlinks of each url
for (urlid,) in self.con.execute('select rowid from urllist'):
inlinks = self.con.execute('select count(*) from link where toid = %i' %urlid).fetchone()[0]
outlinks = self.con.execute('select count(*) from link where fromid = %i' %urlid).fetchone()[0]
if inlinks > outlinks: p = 1
elif inlinks < outlinks: p = -1
else: p = 0
# Calculate constant for authority part
ca = abs (inlinks - outlinks)**p
ca *= float(inlinks)*1/(inlinks + outlinks)
if ca == 0.0: ca = 0.00001
# Calculate constant for hub part
ch = abs (inlinks - outlinks)**(-p)
ch *= float(outlinks)*1/(inlinks + outlinks)
if ch == 0.0: ch = 0.00001
self.con.execute('update auth_myhits set ca = %f where urlid = %i' %(ca, urlid))
self.con.execute('update hub_myhits set ch = %f where urlid = %i' %(ch, urlid))
self.dbcommit()
auth_currentscores = zeros(n)
hub_currentscores = zeros(n)
# Start the scores calculation process
for i in range(iter):
agg_authScore = 0.0
agg_hubScore = 0.0
print "Iteration %i" %(i)
# Authority part
auth_prevscores = deepcopy(auth_currentscores)
for (urlid,) in self.con.execute('select rowid from urllist'):
authScore = 0.0
# Loop through all the pages that link to this one
for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
# Get the hub scores and ch of the linker
linker_hubScore = self.con.execute('select score from hub_myhits where urlid = %i' %linker).fetchone()[0]
linker_ch = self.con.execute ('select ch from hub_myhits where urlid = %i' %linker).fetchone()[0]
authScore += linker_hubScore*linker_ch
self.con.execute ('update auth_myhits set score = %f where urlid = %i' %(authScore, urlid))
agg_authScore += authScore
for (urlid,) in self.con.execute('select rowid from urllist'):
NauthScore = self.con.execute('select score from auth_myhits where urlid = %i' %urlid).fetchone()[0]
normalized_authScore = NauthScore*1/agg_authScore
auth_currentscores[urlid-1] = normalized_authScore
self.con.execute('update auth_myhits set score = %f where urlid = %i' %(normalized_authScore, urlid))
self.dbcommit()
# Hub part
hub_prevscores = deepcopy(hub_currentscores)
for (urlid,) in self.con.execute('select rowid from urllist'):
hubScore = 0.0
# Loop through all the pages that being linked by this one
for (linker,) in self.con.execute('select distinct toid from link where fromid = %i' %urlid):
# Get the authority scores and ca of the linker
linker_authScore = self.con.execute('select score from auth_myhits where urlid = %i' %linker).fetchone()[0]
linker_ca = self.con.execute('select ca from auth_myhits where urlid = %i' %linker).fetchone()[0]
hubScore += linker_authScore*linker_ca
self.con.execute('update hub_myhits set score = %f where urlid = %i' %(hubScore, urlid))
agg_hubScore += hubScore
for (urlid,) in self.con.execute('select rowid from urllist'):
NhubScore = self.con.execute('select score from hub_myhits where urlid = %i' %urlid).fetchone()[0]
normalized_hubScore = NhubScore*1/agg_hubScore
hub_currentscores[urlid-1] = normalized_hubScore
self.con.execute('update hub_myhits set score=%f where urlid = %i' %(normalized_hubScore, urlid))
auth_res = sum(abs(auth_currentscores-auth_prevscores))
self.con.execute('insert into auth_myhitsconvergence(residu) values(%f)'%auth_res)
hub_res = sum(abs(hub_currentscores-hub_prevscores))
self.con.execute('insert into hub_myhitsconvergence(residu) values(%f)'%hub_res)
self.dbcommit()
def calculateall(self):
self.calculatepagerank(iter)
#self.calculatemypagerank(iter)
self.calculatehits(iter)
self.calculatemyhits(iter)
#self.calculatelength()
def plotconvergence(self, name = 'http://www.britannica.com/blogs/'):
x = arange(iter)
prscores = []
for (residu, ) in self.con.execute('select residu from prconvergence'):
prscores.append(residu)
authscores = []
for (residu, ) in self.con.execute('select residu from auth_hitsconvergence'):
authscores.append(residu)
myauthscores = []
for (residu, ) in self.con.execute('select residu from auth_myhitsconvergence'):
myauthscores.append(residu)
semilogy(x, prscores, 'b:s', x, authscores, 'r:o', x, myauthscores, 'g--d')
xlabel('Iterations')
ylabel('Error (log scale)')
title('Convergence Rate (' + name + ')')
legend(('PageRank','HITS (authority)','modified HITS (authority)'))
grid(True)
show()
def fact(self):
# Number of nodes in the network
nodeNum = self.con.execute('select count(*) from urllist').fetchone()[0]
# Non-zero entries
nonZero = 0
for (id,) in self.con.execute('select linkid from linkwords'):
nonZero += 1
# SIMILARITY MEASURES
# Calculate Array of ranking vector
prvector = [] # PageRank vector
for (score, ) in self.con.execute('select score from pagerank'):
prvector.append(score)
authvector = [] # HITS vector
for (score, ) in self.con.execute('select score from auth_hits'):
authvector.append(score)
myauthvector = [] # modified HITS vector
for (score, ) in self.con.execute('select score from auth_myhits'):
myauthvector.append(score)
inboundlink = []
for (urlid,) in self.con.execute('select rowid from urllist'):
count = 0
for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
count += 1
inboundlink.append(count) # Inboundlink vector
# Change lists into arrays
prvector = array(prvector)
authvector = array(authvector)
myauthvector = array(myauthvector)
inboundlink = array(inboundlink)
###################################
# Similarity between PageRank and authority part of HITS
# Cosine criterion
num = sum(prvector*authvector)
den = sqrt(sum(prvector*prvector))*sqrt(sum(authvector*authvector))
simCosPH = float(num)/den
# Spearman rank order correlation coefficient criterion
spPH = stats.spearmanr(prvector, authvector)[0]
# Kendall's Tau rank order correlation coefficient criterion
ktPH = stats.kendalltau(prvector, authvector)[0]
# Similarity between PageRank and authority part of modified HITS
# Cosine criterion
num = sum(prvector*myauthvector)
den = sqrt(sum(prvector*prvector))*sqrt(sum(myauthvector*myauthvector))
simCosPmH = float(num)/den
# Spearman rank order correlation coefficient criterion
spPmH = stats.spearmanr(prvector, myauthvector)[0]
# Kendall's Tau rank order correlation coefficient criterion
ktPmH = stats.kendalltau(prvector, myauthvector)[0]
# Similarity between PageRank and Inboundlink
# Cosine criterion
num = sum(prvector*inboundlink)
den = sqrt(sum(prvector*prvector))*sqrt(sum(inboundlink*inboundlink))
simCosPI = float(num)/den
# Spearman rank order correlation coefficient criterion
spPI = stats.spearmanr(prvector, inboundlink)[0]
# Kendall's Tau rank order correlation coefficient criterion
ktPI = stats.kendalltau(prvector, inboundlink)[0]
###################################
# Similarity between authority part of HITS and PageRank
# Cosine criterion
simCosHP = simCosPH
# Spearman rank order correlation coefficient criterion
spHP = spPH
# Kendall's Tau rank order correlation coefficient criterion
ktHP = ktPH
# Similarity between authority part of HITS and authority part of modified HITS
# Cosine criterion
num = sum(authvector*myauthvector)
den = sqrt(sum(authvector*authvector))*sqrt(sum(myauthvector*myauthvector))
simCosHmH = float(num)/den
# Spearman rank order correlation coefficient criterion
spHmH = stats.spearmanr(authvector, myauthvector)[0]
# Kendall's Tau rank order correlation coefficient criterion
ktHmH = stats.kendalltau(authvector, myauthvector)[0]
# Similarity between authority part of HITS and Inboundlink
# Cosine criterion
num = sum(authvector*inboundlink)
den = sqrt(sum(authvector*authvector))*sqrt(sum(inboundlink*inboundlink))
simCosHI = float(num)/den
# Spearman rank order correlation coefficient criterion
spHI = stats.spearmanr(authvector, inboundlink)[0]
# Kendall's Tau rank order correlation coefficient criterion
ktHI = stats.kendalltau(authvector, inboundlink)[0]
###################################
###################################
# Similarity between authority part of modified HITS and PageRank
# Cosine criterion
simCosmHP = simCosPmH
# Spearman rank order correlation coefficient criterion
spmHP = spPmH
# Kendall's Tau rank order correlation coefficient criterion
ktmHP = ktPmH
# Similarity between authority part of modified HITS and authority part of HITS
# Cosine criterion
simCosmHH = simCosHmH
# Spearman rank order correlation coefficient criterion
spmHH = spHmH
# Kendall's Tau rank order correlation coefficient criterion
ktmHH = ktHmH
# Similarity between authority part of modified HITS and Inboundlink
# Cosine criterion
num = sum(myauthvector*inboundlink)
den = sqrt(sum(myauthvector*myauthvector))*sqrt(sum(inboundlink*inboundlink))
simCosmHI = float(num)/den
# Spearman rank order correlation coefficient criterion
spmHI = stats.spearmanr(myauthvector, inboundlink)[0]
# Kendall's Tau rank order correlation coefficient criterion
ktmHI = stats.kendalltau(myauthvector, inboundlink)[0]
###################################
###################################
# Similarity between Inboundlink and PageRank
# Cosine criterion
simCosIP = simCosPI
# Spearman rank order correlation coefficient criterion
spIP = spPI
# Kendall's Tau rank order correlation coefficient criterion
ktIP = ktPI
# Similarity between Inboundlink and authority part of HITS
# Cosine criterion
simCosIH = simCosHI
# Spearman rank order correlation coefficient criterion
spIH = spHI
# Kendall's Tau rank order correlation coefficient criterion
ktIH = ktHI
# Similarity between Inboundlink and authority part of modified HITS
# Cosine criterion
simCosImH = simCosmHI
# Spearman rank order correlation coefficient criterion
spImH = spmHI
# Kendall's Tau rank order correlation coefficient criterion
ktImH = ktmHI
###################################
print '-----------------------------------------------'
print 'nodeNum = %i, nonZero = %i'%(nodeNum, nonZero)
print '-----------------------------------------------'
print 'SIMILARITY MEASURE, COSINE CRITERION'
print 'STANDARD MEASURE: PAGERANK'
print 'simCosPH = %f, simCosPmH = %f, simCosPI = %f' %(simCosPH, simCosPmH, simCosPI)
print ''
print 'STANDARD MEASURE: HITS'
print 'simCosHP = %f, simCosHmH = %f, simCosHI = %f' %(simCosHP, simCosHmH, simCosHI)
print ''
print 'STANDARD MEASURE: modified HITS'
print 'simCosmHP = %f, simCosmHH = %f, simCosmHI = %f' %(simCosmHP, simCosmHH, simCosmHI)
print ''
print 'STANDARD MEASURE: Inboundlink'
print 'simCosIP = %f, simCosIH = %f, simCosImH = %f' %(simCosIP, simCosIH, simCosImH)
print '----------------------------------------------'
print 'SIMILARITY MEASURE, SPEARMAN CRITERION'
print 'STANDARD MEASURE: PAGERANK'
print 'spPH = %f, spPmH = %f, spPI = %f' %(spPH, spPmH, spPI)
print ''
print 'STANDARD MEASURE: HITS'
print 'spHP = %f, spHmH = %f, spHI = %f' %(spHP, spHmH, spHI)
print ''
print 'STANDARD MEASURE: modified HITS'
print 'spmHP = %f, spmHH = %f, spmHI = %f' %(spmHP, spmHH, spmHI)
print ''
print 'STANDARD MEASURE: Inboundlink'
print 'spIP = %f, spIH = %f, spImH = %f' %(spIP, spIH, spImH)
print '----------------------------------------------'
print "SIMILARITY MEASURE, KENDALL'S TAU CRITERION"
print 'STANDARD MEASURE: PAGERANK'
print 'ktPH = %f, ktPmH = %f, ktPI = %f' %(ktPH, ktPmH, ktPI)
print ''
print 'STANDARD MEASURE: HITS'
print 'ktHP = %f, ktHmH = %f, ktHI = %f' %(ktHP, ktHmH, ktHI)
print ''
print 'STANDARD MEASURE: modified HITS'
print 'ktmHP = %f, ktmHH = %f, ktmHI = %f' %(ktmHP, ktmHH, ktmHI)
print ''
print 'STANDARD MEASURE: Inboundlink'
print 'ktIP = %f, ktIH = %f, ktImH = %f' %(ktIP, ktIH, ktImH)
f = open('Similarity.txt','w')
f.write('-----------------------------------------------\n')
f.write('nodeNum = %i, nonZero = %i\n'%(nodeNum, nonZero))
f.write('-----------------------------------------------\n')
f.write('SIMILARITY MEASURE, COSINE CRITERION\n')
f.write('STANDARD MEASURE: PAGERANK\n')
f.write('simCosPH = %f, simCosPmH = %f, simCosPI = %f\n' %(simCosPH, simCosPmH, simCosPI))
f.write(' \n')
f.write('STANDARD MEASURE: HITS\n')
f.write('simCosHP = %f, simCosHmH = %f, simCosHI = %f\n' %(simCosHP, simCosHmH, simCosHI))
f.write(' \n')
f.write('STANDARD MEASURE: modified HITS\n')
f.write('simCosmHP = %f, simCosmHH = %f, simCosmHI = %f\n' %(simCosmHP, simCosmHH, simCosmHI))
f.write(' \n')
f.write('STANDARD MEASURE: Inboundlink\n')
f.write('simCosIP = %f, simCosIH = %f, simCosImH = %f\n' %(simCosIP, simCosIH, simCosImH))
f.write('----------------------------------------------\n')
f.write('SIMILARITY MEASURE, SPEARMAN CRITERION\n')
f.write('STANDARD MEASURE: PAGERANK\n')
f.write('spPH = %f, spPmH = %f, spPI = %f\n' %(spPH, spPmH, spPI))
f.write(' \n')
f.write('STANDARD MEASURE: HITS\n')
f.write('spHP = %f, spHmH = %f, spHI = %f\n' %(spHP, spHmH, spHI))
f.write(' \n')
f.write('STANDARD MEASURE: modified HITS\n')
f.write('spmHP = %f, spmHH = %f, spmHI = %f\n' %(spmHP, spmHH, spmHI))
f.write(' \n')
f.write('STANDARD MEASURE: Inboundlink\n')
f.write('spIP = %f, spIH = %f, spImH = %f\n' %(spIP, spIH, spImH))
f.write('----------------------------------------------\n')
f.write("SIMILARITY MEASURE, KENDALL'S TAU CRITERION\n")
f.write('STANDARD MEASURE: PAGERANK\n')
f.write('ktPH = %f, ktPmH = %f, ktPI = %f\n' %(ktPH, ktPmH, ktPI))
f.write(' \n')
f.write('STANDARD MEASURE: HITS\n')
f.write('ktHP = %f, ktHmH = %f, ktHI = %f\n' %(ktHP, ktHmH, ktHI))
f.write(' \n')
f.write('STANDARD MEASURE: modified HITS\n')
f.write('ktmHP = %f, ktmHH = %f, ktmHI = %f\n' %(ktmHP, ktmHH, ktmHI))
f.write(' \n')
f.write('STANDARD MEASURE: Inboundlink\n')
f.write('ktIP = %f, ktIH = %f, ktImH = %f\n' %(ktIP, ktIH, ktImH))
f.close()
def backbutton(self):
word = 'backbutton'
wordid = self.getentryid('wordlist', 'word', word)
for (urlid,) in self.con.execute('select rowid from urllist'):
if self.con.execute('select toid from link where fromid = %i' %urlid).fetchone() == None:
for (backlinker,) in self.con.execute('select fromid from link where toid = %i' %urlid):
if backlinker == urlid: continue
cur = self.con.execute("insert into link(fromid, toid) values(%i, %i)" %(urlid, backlinker))
linkid = cur.lastrowid
self.con.execute("insert into linkwords(linkid, wordid) values(%i, %i)" %(linkid, wordid))
else: continue
self.dbcommit()
def danglingnode(self):
t = 1
# Clear out danglingnode tables
self.con.execute('drop table if exists dnode')
# Create danglingnode table
self.con.execute('create table dnode(id integer primary key, val integer)')
# Initialize every item with False value
self.con.execute('insert into dnode select rowid, 0 from urllist')
self.dbcommit()
# Find dangling nodes
for (urlid,) in self.con.execute('select rowid from urllist'):
if self.con.execute('select toid from link where fromid = %i' %urlid).fetchone() == None:
self.con.execute('update dnode set val = %i where id=%i'%(t,urlid))
self.dbcommit()
### CLASS CRAWLER ENDS HERE###
#QUERY INDEPENDENT SCORES (LINK SCORES) END HERE#
import pythinsearch14
page = "data"
c=pythinsearch14.crawler('')
c.crawl(page)
#searcher=pythinsearch14.searcher('wikipedia.db')
#wordids,urls=searcher.query('python','qi')