So, this sort of works. The team at neo4j have made the iteration work for me... Usage:
$ python2.6 neo.py http://www.wherever.com
... and what it does is go to that website, grab a few links, follow them, and add the pages and their data to a neo4j graph database.
import neo4j #See http://components.neo4j.org/neo4j.py/
import urllib2, traceback, re, urlparse, socket, sys, random, time
socket.setdefaulttimeout(4) #seconds
'''This is my attempt to begin to make proper classes and functions to talk to the neo4j database; it's not meant to be fancy or clever, I just want to be able to CRUD OK-ish.'''
try:
db = neo4j.GraphDatabase("crawler_example3_db")
with db.transaction:
#How do we delete an index which might have old nodes in it? This doesn't work... any ideas? How do I empty an index, and should I? Does deleting a node delete its reference from the index?
'''try:
for ref in db.index('pages'):
ref.delete()
except Exception, err:
print err
'''
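# One possible way to empty the index (an untested sketch, not something I've
# verified): the index behaves like a dict elsewhere in this file (see
# delete_one below, which does "del pages[url]"), so one could walk the nodes
# and drop each entry by its url property:
#
#   for node in db.node:
#       try:
#           del pages[node['url']] # the reference node has no 'url', hence the try
#       except Exception:
#           pass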
pages = db.index("pages", create=True) # create an index called 'pages'
#print "Index created"
#return db, pages
except Exception, err:
print err
#print "db:", db
############# UTILITY FUNCTIONS #############
def get_links( data, url=''):
"I know I should use BeautifulSoup or lxml, but for simplicity it's a regex.. ha"
links = [ ]
try:
found_links = re.findall(' href="?([^\s^"\'#]+)', data)
for link in found_links:
#print link
link = urlparse.urljoin(url, link)
link = link.replace("/..", "") #fix relative links
link = link.replace("'", "")
if link not in links and link[:7] == 'http://' :#avoid mailtos
links.append( link )
except Exception, err:
print err
return links
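# Rough sanity check for get_links (a sketch; example.com is just a stand-in):
#   get_links('<a href="/about">a</a> <a href="mailto:me@x.com">b</a>', 'http://example.com/')
#   -> ['http://example.com/about']
# Relative hrefs get joined onto the base url, and anything that doesn't start
# with http:// (mailto:, https:, etc.) is filtered out.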
def wget(url):
try:
handle = urllib2.urlopen(url)
if 'text/html' in handle.headers['content-type']:
data = unicode(handle.read(), errors='ignore')
return data
except Exception, e:
print "Error wgetting", e, url
#print traceback.print_exc()
return None
def delete_all_pages():
with db.transaction:
try:
for node in db.node:
id = node.id
node.delete( )
print id, "deleted"
except Exception, err:
print err
pass # fails on last iteration?
print "All pages deleted!"
def create_page( url, code=200 , follow=True):
'''this creates a page node then adds it to the "pages" index, if it's there already it'll get it'''
# Get the actual page data from the web...
data = wget( url ) #get the actual html
if not data:
print "Boo!", url #fetch failed, nothing to store
return None
else:
data_len = len(data)
page_node = pages[url] # does this page exist yet?
if not page_node:
page_node = db.node(url=url, code=code, data=str(data), data_len=data_len) # create a page
pages[ url ] = page_node # Add to index
print "Created:" , url
else:
print "Exists already:" , url
#Now create a page for every link in the page
if follow == True:
print "\tfollowing links"
i = 0
links = get_links(data, url)
print len( links ) , "found"
if len(links) > 20:
links = links[:19]
for link in links:
print "\tcreating:", link
create_page( link, follow=False)
return page_node
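# The "pages" index is what stops duplicate nodes: calling create_page twice
# with the same url should hit the "Exists already" branch the second time and
# hand back the same node (a sketch; example.com is just a stand-in):
#   with db.transaction:
#       a = create_page('http://example.com/', follow=False)
#       b = create_page('http://example.com/', follow=False) # prints "Exists already:"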
def get_one(url):
'given a url, will return a node obj'
with db.transaction:
try:
node = pages [ url ]
if node == None:
print "Node is none! Creating..."
node = create_page( url, code=200 , follow=False)
return node
except Exception, err:
print err
return None
def list_all_pages( ):
# Just iterate through the pages to make sure the data is in there...
print "Listing all pages..."
with db.transaction:
for node in db.node:
try:
print node
#print node['data'][:150], "..."
print
except Exception, err:
print err
print "...done listing!"
def delete_one( url ):
'delete the node for a url and remove it from the "pages" index'
with db.transaction:
try:
node = pages[ url ]
if node:
node_id = node.id #grab the id before the node goes away
del pages[url] #delete from the index first
node.delete( )
print "Node with id=", node_id, "deleted"
except Exception, err:
print "Node probably not found:", err
print dir(node) # let's have a look
def find_links_between_pages( ):
'Go to every page node, pull the links out of its stored data, and add links_to relationships for pages already in the database'
print "Linking all pages..."
with db.transaction:
try:
for node in db.node:
try:
print str(node), node['url'], node['data_len']
links = get_links( node['data'], node['url'])
for link in links:
try:
print link
#look to see if a node with that url exists, if it doesn't it's created...
other_node = get_one( link )
if other_node:
other_node.links_to( node )
except Exception, err:
print err
print
except Exception, err:
print err
print
except Exception, err:
pass # fails on last iteration?
print err
class Backlink(neo4j.Traversal):
types = [neo4j.Outgoing.links_to]
order = neo4j.DEPTH_FIRST
stop = neo4j.STOP_AT_END_OF_GRAPH
def isReturnable(self, pos):
if pos.is_start: return False
else: return pos.last_relationship.type == 'links_to'
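# Backlink isn't exercised in __main__ below; a rough usage sketch (assuming
# the neo4j.py convention of instantiating a Traversal subclass with a start node):
#   with db.transaction:
#       start = get_one(url)
#       if start:
#           for hit in Backlink(start):
#               print hit['url']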
if __name__ == '__main__':
#### LET'S GET STARTED ######
try:
url = sys.argv[1]
except:
print "No url passed, using 'http://diveintopython.org/'"
url = 'http://diveintopython.org/'
print "Starting:", url
######## DELETE ALL THE PAGES WE HAVE SO FAR ############
# Avoiding this because of ...
#jpype._jexception.RuntimeExceptionPyRaisable: org.neo4j.graphdb.NotFoundException: Node[0]
#... errors later...
#print "Deleting existing pages..."
#delete_all_pages( ) #we may want to add new data to each page... forget this for now
######## ADD SOME ACTUAL PAGES #################
#print "Creating some pages "
with db.transaction:
create_page( url, 200, follow=True ) #Also fetches some pages linked from this page
######## NOW GET SOME NODES OUT ###############
print
print "Has our data made it to the database? Let's see..."
# Do some fishing for nodes...
with db.transaction:
try:
node = get_one( url )
print '\tget one:' + str( node ) , "... oh yes!"
print "\tid:", node.id , "url:", node['url'], "data-length:", node['data_len']
print
#This should fail
#print "This SHOULD fail.."
#node = get_one( 'http://www.pythonware.com/daily/' )
#print 'get one:' + str( node )
#print "id:", node.id , "url:", node['url'], "data length:", node['data_len']
#print
except Exception, err:
print "Probably not a node with that URL"
print err
print
######### TRY TO ITERATE ALL PAGES FETCHED ################
list_all_pages( )
######### TRY TO DELETE ONE ################
#delete_one( url )
# Now let's see if it has gone
#list_all_pages( ) #or maybe later
######### LET'S LOOK FOR RELATIONSHIPS BETWEEN PAGES ################
print
print "Doing linking pages..."
find_links_between_pages( ) #goes to every page, looking for other pages in the database
print
print "shutting down, saving all the changes, etc"
db.shutdown()