For many years now, I've been chasing the funky data model, a database where you can change your mind about the structure of the data on the fly, without losing all chance of ever getting the data out again. Neo4J promises that way of working.
So, before I get all fancy, I'm trying to make a painfully simple python crawler that will go to a URL, find a few pages linked from that URL, add them to the database and then create relationships between the pages based on their links. Simple. Ahem.
This code doesn't quite work at the moment... but it's almost there....
import neo4j #See http://components.neo4j.org/neo4j.py/
import urllib2, traceback, re, urlparse, socket, sys
socket.setdefaulttimeout(4) #seconds
'''This is my attempt to begin to make proper classes and functions to talk to the neo4j database; it's not meant to be fancy.'''
def setup():
    ######## DO SOME SETUP STUFF ############
    db = neo4j.GraphDatabase("crawler_example_db")
    with db.transaction:
        # How do we delete an index which might have old nodes in it? This doesn't work... any ideas?
        # How do I empty an index, and should I? Does deleting a node delete its reference from the index?
        try:
            for ref in db.index('pages'):
                ref.delete()
        except Exception, err:
            print err
        pages = db.index("pages", create=True)  # create an index called 'pages'
    print "Index created"
    return db, pages

############# UTILITY FUNCTIONS #############
def get_links(data, url=''):
    'I know I should use BeautifulSoup or lxml (a hedged BeautifulSoup sketch follows this function)'
    links = []
    try:
        found_links = re.findall(' href="?([^\s^"\'#]+)', data)
        for link in found_links:
            #print link
            link = urlparse.urljoin(url, link)
            link = link.replace("/..", "")  # fix relative links
            link = link.replace("'", "")
            if link not in links and link[:7] == 'http://':  # avoid mailtos and duplicates
                links.append(link)
    except Exception, err:
        print err
    return links

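# A hedged alternative to the regex above, assuming the Python 2 era
# BeautifulSoup (version 3) is installed. This is just a sketch, not part of
# the crawler proper, and the findAll/href handling should be checked against
# the BeautifulSoup docs before relying on it.
def get_links_bs(data, url=''):
    'sketch only: extract links with BeautifulSoup instead of a regex'
    from BeautifulSoup import BeautifulSoup  # easy_install BeautifulSoup
    links = []
    try:
        soup = BeautifulSoup(data)
        for tag in soup.findAll('a', href=True):       # every <a> with an href attribute
            link = urlparse.urljoin(url, tag['href'])  # resolve relative links
            if link not in links and link[:7] == 'http://':  # same filtering as get_links()
                links.append(link)
    except Exception, err:
        print err
    return links
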
def wget(url):
    try:
        handle = urllib2.urlopen(url)
        if 'text/html' in handle.headers['content-type']:
            data = unicode(handle.read(), errors='ignore')
            return data
    except Exception, e:
        print "Error wgetting", e, url
        #print traceback.print_exc()
    return None  # not html, or the fetch failed

def delete_all_pages():
    with db.transaction:
        try:
            for node in db.node:
                id = node.id
                node.delete()
                print id, "deleted"
        except Exception, err:
            print err
            pass  # fails on last iteration?
    print "All pages deleted!"

def create_page(db, pages, url, code=200, follow=True):
    '''this creates a page node then adds it to the "pages" index; if it's there already it'll get the existing one'''
    # Get the actual page data from the web...
    data = wget(url)  # get the actual html
    if not data:
        return None
    else:
        data_len = len(data)
    page_node = pages[url]  # does this page exist yet?
    if not page_node:
        page_node = db.node(url=url, code=code, data=data, data_len=data_len)  # create a page
        pages[url] = page_node  # Add to index
        print "Created:", url
    else:
        print "Exists already:", url
    # Now create a page for every link in the page
    if follow == True:
        print "\tfollowing links"
        links = get_links(data, url)
        print len(links), "found"
        if len(links) > 20:
            links = links[:19]  # only follow the first few links
        for link in links:
            print "\tcreating:", link
            create_page(db, pages, link, follow=False)
    return page_node

def get_one(url):
    'given a url, return a node obj (uses the module-level db and pages set up in __main__)'
    with db.transaction:
        try:
            node = pages[url]
            if node == None:
                print "Node is none!"
            return node
        except Exception, err:
            print err
            return None

def list_all_pages(db):
    # Just iterate through the pages to make sure the data is in there...
    print "Listing all pages..."
    with db.transaction:
        for node in db.node:
            try:
                print node
                #print node['data'][:150], "..."
                print
            except Exception, err:
                print err
    print "...done listing!"

def delete_one(url):
    'delete a single page node, looked up by url'
    with db.transaction:
        node = None
        try:
            node = pages[url]
            if node:
                id = node.id
                node.delete()
                print "Node with id=", id, "deleted"
        except Exception, err:
            print "Node probably not found:", err
            print dir(node)  # let's have a look at whatever we got back

def find_links_between_pages(db):
    'Iterate through the pages, creating a "links_to" relationship for every link that points at a page we already have'
    with db.transaction:
        try:
            print "Linking all pages..."
            for node in db.node:
                try:
                    print str(node), node['url'], node['data_len']
                    links = get_links(node['data'], node['url'])
                    for link in links:
                        try:
                            # look to see if a node with that url exists
                            other_node = get_one(link)
                            if not other_node:
                                pass
                            else:
                                node.links_to(other_node)  # this page links to the other page
                        except Exception, err:
                            print err
                except Exception, err:
                    print err
                print
        except Exception, err:
            print err  # fails on last iteration?

if __name__ == '__main__':
    #### LET'S GET STARTED ######
    db, pages = setup()  # I can't connect on load, because then I can't reload(this_module) because it is already connected
    try:
        url = sys.argv[1]
    except IndexError:
        print "No url passed, using 'http://diveintopython.org/'"
        url = 'http://diveintopython.org/'
print "Starting:", url
######## DELETE ALL THE PAGES WE HAVE SO FAR ############
# Avoiding this because of ...
#jpype._jexception.RuntimeExceptionPyRaisable: org.neo4j.graphdb.NotFoundException: Node[0]
#... errors later...
#print "Deleting existing pages..."
#delete_all_pages( ) #we may want to add new data to each page... forget this for now
######## ADD SOME ACTUAL PAGES #################
#print "Creating some pages "
with db.transaction:
create_page( db, pages, url, 200 ) #Also fetches some pages linked from this page
    ######## NOW GET SOME NODES OUT ###############
    print
    print "Has our data made it to the database? Let's see..."
    # Do some fishing for nodes...
    with db.transaction:
        try:
            node = get_one(url)
            print '\tget one:' + str(node), "... oh yes!"
            print "\tid:", node.id, "url:", node['url'], "data-length:", node['data_len']
            print
            #This should fail
            #print "This SHOULD fail.."
            #node = get_one( 'http://www.pythonware.com/daily/' )
            #print 'get one:' + str( node )
            #print "id:", node.id , "url:", node['url'], "data length:", node['data_len']
            #print
        except Exception, err:
            print "Probably not a node with that URL"
            print err
            print

    db.shutdown()
    db, pages = setup()
    ######### TRY TO ITERATE ALL PAGES FETCHED ################
    list_all_pages(db)

    ######### TRY TO DELETE ONE ################
    #delete_one( url )
    # Now let's see if it has gone
    #list_all_pages( db ) #or maybe later

    ######### LET'S LOOK FOR RELATIONSHIPS BETWEEN PAGES ################
    print "Linking pages..."
    find_links_between_pages(db)  # goes to every page, looking for other pages it links to
    print "shutting down, saving all the changes, etc"
    db.shutdown()
You have to delete the index entry yourself when you delete a node. See http://wiki.neo4j.org/content/Indexing_with_IndexService
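In other words, delete_one() above would need to remove the index entry as well as the node. A minimal sketch of that idea, assuming the neo4j.py index wrapper supports del pages[url] for removing an entry (the exact call should be checked against the IndexService page linked above):

def delete_one_and_unindex(url):
    'sketch only: remove the index entry first, then delete the node itself'
    with db.transaction:
        node = pages[url]
        if node:
            del pages[url]  # assumption: this is how an index entry is removed in neo4j.py
            node.delete()
            print url, "removed from index and deleted"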
If all you want is something where you can change the data on the fly, then any of the NoSQL databases will do this fine (e.g. a simple key-value store like BigTable) (or even a de-normalised MySQL with everything a string :) ), but that's not really what you are trying to do (as you know). The real gain here is from neo4j being a graphdb and what you want to model also being a graph.
And remind me to show you some Twisted-based crawler code when you are ready to get lots of data in.
I wish the examples were in Python too... I'm a bit lost in Java-world.
Thanks for the Twisted offer; I attempted Twisted and couldn't get it working. I imagine it would be all lovely and threaded (a lot quicker), but with a database that can only be open in one place at a time, I also imagine the threading might break neo4j ?¿ dunno...
Neo4j itself shouldn't have issues with threading, so that would come down to how Jython/CPython behaves (of which I have no knowledge).
Note, however, that transactions are thread-confined and uncommitted modifications will only be visible inside the same transactional context.
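As a concrete illustration of that last point, a threaded version of the crawler would need each worker to open its own transaction against the shared GraphDatabase. A minimal sketch, assuming neo4j.py's db.transaction can be entered from worker threads and reusing create_page() from the post (some_urls is a hypothetical list of URLs to fetch):

import threading

def crawl_worker(db, pages, urls):
    'each worker wraps its own writes in its own transaction'
    for url in urls:
        with db.transaction:  # thread-confined: only this thread sees the changes until the block commits
            create_page(db, pages, url, follow=False)

workers = []
for chunk in (some_urls[0::2], some_urls[1::2]):  # split the hypothetical url list between two workers
    t = threading.Thread(target=crawl_worker, args=(db, pages, chunk))
    workers.append(t)
    t.start()
for t in workers:
    t.join()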
Thanks Anders... maybe I should give threaded crawling a whirl...