Tuesday 3 August 2010

Working Neo4J / Python code


So, this sort of works. The team at neo4j have helped me get iteration over the nodes working... Usage:

>>python2.6 neo.py  http://www.wherever.com

... and what it does is go to that web site, grab a few links, follow them, and add the pages and their data to a neo4j graph database.
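
If you just want to poke around in the database the script leaves behind, this is the basic pattern it relies on -- a minimal sketch pulled out of the full listing below (using the script's own "crawler_example3_db" directory, its "pages" index and its default url):

import neo4j

db = neo4j.GraphDatabase("crawler_example3_db")
with db.transaction:
 pages = db.index("pages", create=True)      # the same index the crawler writes to
 node = pages['http://diveintopython.org/']  # look a crawled page up by its url
 if node:
  print node['url'], node['data_len']        # properties stored by create_page()
db.shutdown()

Anyway, here's the whole script: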

import neo4j #See http://components.neo4j.org/neo4j.py/
import urllib2, traceback, re, urlparse, socket, sys, random, time

socket.setdefaulttimeout(4) #seconds

'''This is my attempt to begin to make proper classes and functions to talk to the neo4j database, it's not meant to be fancy or clever, I just want to be able to CRUD ok-ish'''


try:
 db = neo4j.GraphDatabase("crawler_example3_db")
 with db.transaction:
  #How do we delete an index which might have old nodes in? This doesn't work... any ideas on how to empty an index, and should I? Does deleting a node delete its reference from the index? (A guess at one approach is sketched just below.)
  '''try:
   for ref in db.index('pages'):
    ref.delete()
  except Exception, err:
   print err
  '''
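  # An untested guess at emptying the index: walk db.node and do "del pages[node['url']]"
  # for each page node (skipping any node without a 'url' property, like the reference
  # node) before deleting the node itself -- the same trick delete_one() uses further down.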
  pages = db.index("pages", create=True) # create an index called 'pages'
  #print "Index created"
 #return db, pages
except Exception, err:
 print err
 #print "db:", db


#############     UTILITY FUNCTIONS         #############

def get_links( data, url=''):
 "I know I should use BeautifulSoup or lxml, but for simplicity it's a regex... ha"
 links = [ ]
 try:
  found_links = re.findall(' href="?([^\s^"\'#]+)', data)
  
  for link in found_links:
   #print link
   link = urlparse.urljoin(url, link)
   link = link.replace("/..", "") #fix relative links
   link = link.replace("'", "")
   if link not in links and link[:7] == 'http://' :#avoid mailtos
    links.append( link )
 except Exception, err:
  print err
  
 return links
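# Quick sanity check of what get_links() produces: get_links('<a href="/about">x</a>', 'http://example.com/')
# should come back as ['http://example.com/about'] -- relative hrefs get urljoin'ed onto the page url.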
 
def wget(url):
 try:
  handle = urllib2.urlopen(url)
  if 'text/html' in handle.headers['content-type']:
   data = unicode(handle.read(),  errors='ignore')
   return data
 except Exception, e:
  print "Error wgetting", e, url
  #print traceback.print_exc()
 return None

def delete_all_pages():
 with db.transaction:
  try:
   for node in db.node:
    id  = node.id
    node.delete( )
    print id, "deleted"
  except Exception, err:
   print err
   pass # fails on last iteration?
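   # (possibly the automatically created reference node, Node[0], tripping things up --
   # see the NotFoundException note down in __main__)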
 print "All pages deleted!"


def create_page( url, code=200 , follow=True):
 '''this creates a page node then adds it to the "pages" index, if it's there already it'll get it'''
 
 # Get the actual page data from the web...
 data = wget( url ) #get the actual html
  if not data:
   print "Boo!", url
   return None
 else:
  data_len = len(data)
  page_node = pages[url] # does this page exist yet?
  if not page_node:
   page_node = db.node(url=url, code=code, data=str(data), data_len=data_len) # create a page
   pages[ url ] = page_node # Add to index
   print "Created:" , url
  else:
   print "Exists already:" , url
   
  #Now create a page for every link in the page
  if follow == True:
   print "\tfollowing links"
   i = 0
   links = get_links(data, url)
   print len( links ) , "found"
   if len(links) > 20:
    links = links[:19]
   for link in links:
    print "\tcreating:",  link
    create_page( link, follow=False)
  return page_node

def get_one(url):
 'given a url this will return a node obj'
 with db.transaction:
  try:
   node = pages [ url ]
   if node == None:
    print "Node is none! Creating..."
     node = create_page( url, code=200 , follow=False)
   return node
  except Exception, err:
   print err
   
 return None
 

def list_all_pages( ):
 # Just iterate through the pages to make sure the data is in there...
 print "Listing all pages..."
 with db.transaction:
  for node in db.node:
   try:
    print node
    #print node['data'][:150], "..."
    print 
   except Exception, err:
    print err
 print "...done listing!"


def delete_one( url ):
 'delete the node for a url and remove it from the "pages" index'
 with db.transaction:
  try:
   node = pages[ url ]
   if node:
    node.delete( )
    print "Node with id=", node.id, "deleted"
    #delete from index
    del pages[url]
  except Exception, err:
   print "Node probably not found:", err
   print dir(node) # let's have a look



def find_links_between_pages( ):
 'Go through every page node, find the pages it links to, and create links_to relationships between them'
 print "Linking all pages..."
 with db.transaction:
  try: 
   for node in db.node:
    try:
     print str(node), node['url'], node['data_len']
     links = get_links( node['data'], node['url'])
     for link in links:
      try:
       print link
       #look to see if a node with that url exists, if it doesn't it's created...
       other_node = get_one( link )
       if not other_node:
        pass
       else:
        other_node.links_to( node )
      except Exception, err:
       print err
     print
    except Exception, err:
     print err
    print 
  except Exception, err:
   pass # fails on last iteration?
   print err


class Backlink(neo4j.Traversal):
 types = [neo4j.Outgoing.links_to]
 order = neo4j.DEPTH_FIRST
 stop = neo4j.STOP_AT_END_OF_GRAPH
  
 def isReturnable(self, pos):
  if pos.is_start: return False
  else:return pos.last_relationship.type == 'links_to'
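
# Backlink isn't used in __main__ yet; if I'm reading the neo4j.py traversal docs right,
# usage would be something like this (untested):
#
# with db.transaction:
#  start = pages[url]
#  for n in Backlink(start):
#   print n['url'], 'links to', url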


if __name__ == '__main__':

 #### LET'S GET STARTED ######
 try:
  url = sys.argv[1]
 except IndexError:
  print "No url passed, using 'http://diveintopython.org/'"
  url = 'http://diveintopython.org/'
  
 print "Starting:", url
 
 ######## DELETE ALL THE PAGES WE HAVE SO FAR ############
 # Avoiding this because of ...
 #jpype._jexception.RuntimeExceptionPyRaisable: org.neo4j.graphdb.NotFoundException: Node[0] 
 #... errors later...
 #print "Deleting existing pages..."
 #delete_all_pages( ) #we may want to add new data to each page... forget this for now
 
 ########        ADD SOME ACTUAL PAGES        #################
 #print "Creating some pages "
 with db.transaction:
  create_page(  url, 200, follow=True ) #Also fetches some pages linked from this page
 
 ########    NOW GET SOME NODES OUT     ############### 
 print
 print "Has our data made it to the database? Let's see..."
 # Do some fishing for nodes...
 with db.transaction:
  try:
   node = get_one( url )
   print '\tget one:' + str( node ) , "... oh yes!"
   print "\tid:", node.id , "url:", node['url'], "data-length:", node['data_len']
   print 
   
   #This should fail
   #print "This SHOULD fail.."
   #node = get_one( 'http://www.pythonware.com/daily/' )
   #print 'get one:' + str( node ) 
   #print "id:", node.id , "url:", node['url'], "data length:", node['data_len']
   #print 
   
  except Exception, err:
   print "Probably not a node with that URL"
   print err 
 print
 
 #########  TRY TO ITERATE ALL PAGES FETCHED ################
 list_all_pages(  )
 
 #########    TRY TO DELETE ONE ################
 #delete_one( url ) 
 # Now let's see if it has gone
 #list_all_pages( ) #or maybe later
 
 ######### LET'S LOOK FOR RELATIONSHIPS BETWEEN PAGES ################
 print
 print "Doing linking pages..."
 find_links_between_pages( ) #goes to every page, looking for other pages in the database
 
 print
 print "shutting down, saving all the changes, etc"
 db.shutdown()
