Friday, 6 August 2010

Neo4J + Python Crawler + Open Calais + Gephi



So, today after having spent the morning getting our LDAP server to talk to our Cyn.in instance (it worked!) I thought I'd have a bash at this. The plan was/is to create a crawler that gets a web page, finds the pages that it links to (including Word or PDF files),  pumps any data found at the Open Calais API and saves the semantic Entities returned into a Neo4J database then go and look at it with the Gephi visualising tool ( see previous posts ).

This is all kinda new to me, I have no idea if what I'm doing makes sense, but I hope that once I can "look" at data, I'll be able to figure out a way of pruning it into something usable.

The (ropey) code is here and the visualisation is shown above. I have no idea if this will be "traversable" yet, but it kind of proves to me that it's doable. Ideally I want to crawl a given pile of pages into one big soup and then get from page B to page X and "discover" the shortest route between them...

How hard can it be :-)





5    import urllib2, urllib,  traceback, re, urlparse, socket, sys, random, time 
6    from pprint import pprint 
7    import codecs, sys 
8    from calais import Calais      #http://code.google.com/p/python-calais/ 
9    from scraper import fetch     #http://zesty.ca/python/scrape.py 
10    
# Wrap stdout in a UTF-8 stream writer so unicode text (entity names, URLs)
# is encoded as UTF-8 when printed. codecs.getwriter() is the documented way
# to obtain the writer class that codecs.lookup('utf-8')[-1] was picking out.
streamWriter = codecs.getwriter('utf-8')
sys.stdout = streamWriter(sys.stdout)

# Open Calais credentials: replace the placeholder with a real API key.
API_KEY = 'YOUR_KEY_HERE'
calaisapi = Calais(API_KEY, submitter="python-calais demo")
16    
def analyze(url=None):
    """Submit ``url`` to Open Calais and return the response object.

    url -- address of the page to analyse; required despite the default.

    Raises ValueError when no URL is given, instead of handing ``None`` to
    the Calais client and failing somewhere inside it.
    """
    if not url:
        raise ValueError("analyze() needs the URL of the page to analyse")
    return calaisapi.analyze_url(url)
20    
def neo_result(result, url):
    """Store the Calais entities found on ``url`` in the Neo4j database.

    The page node is looked up in the "Pages" index (and created plus
    indexed when missing). For every entity, one value node and one type
    node are created and wired up as ``page -has-> type -is_a-> value``.

    result -- Calais response; ``print_summary()``, ``doc`` and
              ``entities`` are read from it.
    url    -- address of the analysed page, used as the index key.

    The database is opened and shut down on every call; shutdown now also
    happens when anything in between raises.
    """
    import neo4j

    db = neo4j.GraphDatabase("simple_neo_calais_test")
    try:
        result.print_summary()

        # The attribute may be missing when Calais found nothing -- treat
        # that as "no entities" rather than an AttributeError.
        entities = getattr(result, 'entities', [])

        with db.transaction:
            # Create the page index
            pages = db.index("Pages", create=True)
            page_node = pages[url]  # does this page exist yet?
            if not page_node:
                page_node = db.node(url=url)  # create a page
                pages[url] = page_node  # add to index
                print("Created: %s" % url)
            else:
                print("Exists already: %s" % url)

            print("%s Calais Entities" % len(entities))
            for e in entities:
                print(result.doc['info']['externalID'])  # URL
                type_name = e['_type']
                entity_value = e['name']
                relevance = e['relevance']
                instances = e['instances']  # list of contexts

                print("%s %s %s %s" % (type_name, entity_value,
                                       relevance, instances))

                # Create the entity value and its type as separate nodes.
                entity = db.node(value=entity_value, relevance=relevance)
                type_node = db.node(name=type_name)
                # NOTE(review): the original comment reads "Amazon is_a
                # Company", but this creates type -is_a-> value, i.e. the
                # opposite direction. Left as-is so existing graphs and
                # traversals keep working -- confirm which is intended.
                type_node.is_a(entity)
                page_node.has(type_node)
    finally:
        db.shutdown()
56     
59    
def print_result(result):
    """Print selected parts of a Calais result: summary, entities, topics, relations.

    result -- Calais response object. ``entities`` and ``topics`` are read
              with a default of "none", so a response lacking either
              section no longer raises AttributeError.
    """
    result.print_summary()

    print("Entities")
    for e in getattr(result, 'entities', []):
        print("%s : %s" % (e['_type'], e['name']))
        print(e.keys())
        print("")

    print("Topics")
    for t in getattr(result, 'topics', []):
        print(t['category'])

    print("")
    print("Relations")
    # NOTE(review): print_relations() appears to write its own output, like
    # print_summary(); printing its return value only added a stray "None"
    # line, so it is simply called here -- confirm against python-calais.
    result.print_relations()
78    
79    
80    
suffixes_to_avoid = ['css', 'js', 'zip']


def get_links(data, url='', suffixes_to_avoid=suffixes_to_avoid):
    """Pull the http:// links out of ``data`` and sort them by file type.

    I know I should use BeautifulSoup or lxml; this is a plain regex scan
    for ``href`` attributes.

    data              -- HTML text to scan (None or '' yields empty lists).
    url               -- address of the page, used to resolve relative links.
    suffixes_to_avoid -- file extensions (without the dot) to drop; matched
                         case-insensitively.

    Returns ``(links, icos, images, feeds)``: four lists of absolute URLs,
    each URL appearing once, in document order. Anything that is not
    http:// (mailto:, https://, ...) is ignored.
    """
    links, icos, images, feeds = [], [], [], []
    if not data:
        return links, icos, images, feeds

    skipped = set(s.lower() for s in suffixes_to_avoid)
    seen = set()

    # The character class stops at whitespace, quotes and '#', which also
    # strips anchors off the captured link.
    for href in re.findall(r' href="?([^\s^"\'#]+)', data):
        try:
            link = urlparse.urljoin(url, href)  # fix up relative links
            link = link.replace("/..", "")  # drop any "/.." left after joining
            path = urlparse.urlsplit(link)[2]
        except ValueError as err:
            # One malformed URL should not abort the rest of the page.
            print(err)
            continue

        # "http://host/" -> "http://host". Only strip when the slash really
        # is the last character, so "http://host/?q=1" keeps its query.
        if path == "/" and link.endswith("/"):
            link = link[:-1]

        # just in case fixups
        link = link.replace("'", "")

        if not link.startswith('http://') or link in seen:
            continue
        seen.add(link)

        # The extension is taken from the last path segment only.
        leaf = path.rsplit("/", 1)[-1]
        suffix = leaf.rsplit(".", 1)[-1].lower() if "." in leaf else ""

        if suffix in skipped:
            continue
        if suffix == 'ico':
            icos.append(link)
        elif suffix in ('rss', 'xml'):
            feeds.append(link)
        elif suffix in ('gif', 'png', 'jpg', 'jpeg'):
            images.append(link)
        else:
            links.append(link)

    return links, icos, images, feeds
127   
def get_images(content, url):
    """Return the absolute URL of every ``src=`` attribute in ``content``.

    content -- HTML text to scan (None or '' yields an empty list).
    url     -- address of the page, used to resolve relative sources.

    Sources whose path ends in ``.js`` (any letter case) are left out.
    Duplicates are kept, in document order.
    NOTE(review): ``src=`` is matched on any tag, so non-image sources other
    than scripts (e.g. iframes) would be returned as well.
    """
    images = []
    for src in re.findall(r' src="?([^\s^"\'#]+)', content or ''):
        image = urlparse.urljoin(url, src)
        image = image.replace("/..", "")  # drop any "/.." left after joining
        path = urlparse.urlsplit(image)[2]
        if path.lower().endswith(".js"):
            continue
        images.append(image)
    return images
147   
def get_icos(content, url):
    """Return the URL of the favicon for the page at ``url``.

    content -- HTML text of the page (None or '' is allowed).
    url     -- address of the page.

    The first ``href`` ending in ``.ico`` is used, e.g.
    href="/static/img/favicon.ico". When the markup names none, the
    conventional ``/favicon.ico`` on the page's own host is returned as a
    guess -- it is not checked for existence.
    """
    match = re.search(r' href="?([^\s^"\'#]+\.ico)', content or '')
    if match:
        icon = urlparse.urljoin(url, match.group(1))
        return icon.replace("/..", "")  # drop any "/.." left after joining

    # try a default... you never know... might be aliased anyway...
    scheme, domain = urlparse.urlsplit(url)[:2]
    return '%s://%s/favicon.ico' % (scheme, domain)
175   
176   
mimes_u_like = ['text/html', 'application/pdf']


def _mime_of(headers):
    """Return the bare MIME type from response headers, '' when there is none."""
    try:
        content_type = headers['content-type']
    except KeyError:
        return ''
    return (content_type or '').split(";")[0].strip()


def get(url, mimes_u_like=mimes_u_like, fetch_links=False):
    """Fetch ``url`` and describe it as a dict, optionally with its links.

    url          -- address of the page to fetch.
    mimes_u_like -- MIME types of linked documents worth keeping.
    fetch_links  -- when true, every link on the page is fetched as well.

    Returns a dict with the keys url, mime, status, message, headers,
    content, ico, images, feeds and links. ``links`` is a list of dicts
    (url, status, message, headers, mime, content), one per linked document
    that answered 200 with a wanted MIME type; it stays empty unless
    ``fetch_links`` is true.
    """
    url, status, message, headers, content = fetch(url)

    page = {'url': url,
            'mime': _mime_of(headers),
            'status': status,
            'message': message,
            'headers': headers,
            'content': content,
            'links': [],
            'images': [],
            'feeds': []}

    page['ico'] = get_icos(content, url)
    page['images'] = get_images(content, url)

    if not fetch_links:
        # Original author's note: this is wrong... I want to discover the
        # links without necessarily getting their data.
        return page

    links_found, icos, images, feeds = get_links(content, url)
    print(url)
    print("%s links found" % len(links_found))
    print("%s icos" % len(icos))
    print("%s images" % len(images))
    print(images)
    print("%s feeds" % len(feeds))
    page['feeds'] = feeds  # the key existed but was never filled in

    links = []
    for link in links_found:
        # Separate names for the linked document: the page's own url,
        # headers and content must not be overwritten inside the loop.
        try:
            (link_url, link_status, link_message,
             link_headers, link_content) = fetch(link)
        except Exception as err:
            # One unreachable link should not abort the whole crawl.
            print("%s %s" % (link, err))
            continue

        link_mime = _mime_of(link_headers)
        if link_mime in mimes_u_like:
            print("\t%s %s %s" % (link_status, link_mime, link))
            if link_status == 200:
                links.append({'url': link_url,
                              'status': link_status,
                              'message': link_message,
                              'headers': link_headers,
                              'mime': link_mime,
                              'content': link_content})

    print("%s Used" % len(links))
    page['links'] = links
    return page
229   
if __name__ == '__main__':
    # Crawl one start page, push it and every page it links to through
    # Open Calais, and store the entities that come back in Neo4j.
    url = 'http://theotherblog.com'
    print("Getting: %s" % url)
    page = get(url, fetch_links=True)

    # The start page is guarded the same way as the linked pages below, so
    # a Calais failure here no longer stops the run before any link is tried.
    try:
        neo_result(analyze(url), url)
    except Exception as err:
        print("%s %s" % (url, err))

    print("Getting %s links.." % len(page['links']))
    for link_dict in page['links']:
        link_url = link_dict['url']
        print("Getting link: %s" % link_url)
        try:
            neo_result(analyze(link_url), link_url)
        except Exception as err:
            print("%s %s" % (link_url, err))

    print("Done!")




Wednesday, 4 August 2010

Django and Neo4J

So, today I was tempted by the approach taken in these slides about Neo4J and Django. My hope was/is that whilst the Neo4J implementation isn't complete, because I imagine many of the concepts simply won't easily map from a graph database to a SQL one, at least I might be able to work in a more familiar way in order to be able to learn about neo4j.

I started by grabbing the files from the model folder from here.... adding...

NEO4J_RESOURCE_URI = '/Users/tomsmith/neo4django_db'  

...to my settings.py file and then adding this code to a models.py file...


from neo4j.model import django_model as models



class Actor(models.NodeModel):
    """An actor node with an indexed ``name`` property."""
    name = models.Property(indexed=True)
    # URL path for this actor, built from the id of the underlying node.
    href = property(lambda self: ('/actor/%s/' % self.node.id))

    def __unicode__(self):
        return self.name

class Movie(models.NodeModel):
    """A movie node with an indexed ``title``, a ``year`` and related actors."""
    title = models.Property(indexed=True)
    year = models.Property()
    # URL path for this movie, built from the id of the underlying node.
    href = property(lambda self: ('/movies/%s/' % self.node.id))

    # Outgoing ACTS_IN relationships to Actor nodes; reachable from the
    # Actor side under the name "acts_in".
    actors = models.Relationship(Actor, type=models.Outgoing.ACTS_IN, related_name="acts_in",)

    def title_length(self):
        """Return the number of characters in the title."""
        return len(self.title)

    def __unicode__(self):
        return self.title


... and then from within python manage.py shell I could...


>> import stuff.models as m


>> movie = m.Movie(title="Fear and Loathing in Las Vegas",year=1998)
>> movie.save()

>>> movie.save()
>>> movie.id
4L
>>> movie = m.Movie(title="The Brothers Grimm",year=2005) 
>>> movie.save()
>>> movie.id
5L
>>> movie = m.Movie(title="Twelve Monkeys",year=1995) 
>>> movie.save()
>>> movie = m.Movie(title="Memento",year=2000) 
>>> movie.save()

... now I can...

>> movies = m.Movie.objects.all()
>>> for movie in movies:print movie.id, movie, movie.href
... 
7 Memento /movies/7/
6 Twelve Monkeys /movies/6/
5 The Brothers Grimm /movies/5/
4 Fear and Loathing in Las Vegas /movies/4/
3 The Book of Eli /movies/3/
1 10,000 years B.C. /movies/1/
>>> 


... which is fantastic! Don't you think?! The django_model is handling the db.transaction stuff. I can even do a ...

>> reload( m ) 

... and it doesn't break (because the database is already connected) as my earlier code did. (I think I can even run my django instance AND be connected to the neo4j database at the same time.. phew!).

 What I think is fantastic about it is that using this approach I get to work with objects in a familiar way. And by that, I mean, having a schema-less database is all well and good, but almost the first thing I seem to find myself doing is creating similar types of objects. So being able to create classes (maybe even with subclasses) seems perfect for my needs ... in theory. In theory, I'd like to be able to define classes with properties but then on-the-fly maybe add extra properties to objects (without changing my class definition).

AND I can add methods to my classes... or can I ?  I added a simple function to my Movie class...

     def title_length(self):
return len(self.title)

...and ran... 

>>> movies = m.Movie.objects.all()
>>> for movie in movies:print movie.id, movie, movie.href, movie.title_len()

...and got...

AttributeError: 'Movie' object has no attribute 'title_len'

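... Boo! (Note: the method was defined as title_length, so the call to title_len() fails because of the name mismatch rather than anything in the model.) At this point, after looking at the Movie objects, I found ...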

>>> dir( m.Movie.objects )
['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slotnames__', '__str__', '__subclasshook__', '__weakref__', '_copy_to_model', '_inherited', '_insert', '_set_creation_counter', '_update', 'aggregate', 'all', 'annotate', 'complex_filter', 'contribute_to_class', 'count', 'create', 'creation_counter', 'dates', 'defer', 'distinct', 'exclude', 'extra', 'filter', 'get', 'get_empty_query_set', 'get_or_create', 'get_query_set', 'in_bulk', 'iterator', 'latest', 'model', 'none', 'only', 'order_by', 'reverse', 'select_related', 'update', 'values', 'values_list']

...so I did a ...

>>> movies = m.Movie.objects.all().order_by('year')

...and got ...

Traceback (most recent call last):
  File "", line 1, in
AttributeError: 'NodeQuerySet' object has no attribute 'order_by'

...and the same is true for .get_or_create() or .values() or many other methods.

... so (and this isn't a criticism) it seems that the Django model is a "work in progress" (the cat ate his source code) with some of the methods still to be completed. At this point I should probably start looking at the django_model source code and begin adding to the functionality it offers... except ...

  1. I probably don't have the ability to create a database adapter, especially for one I don't understand
  2. I don't know if shoe-horning neo4j into Django is a good idea. I'm not saying it isn't ( so far, it's tantalising ) I'm just saying I don't know.
  3. When I find errors I'm not sure if it's me or not...
...for example, I tried...

>>> actor = m.Actor(name="Joseph Melito")
>>> actor.save()
>>> actor.id
10L
>>> actor = m.Actor(name="Bruce Willis")
>>> actor.save()
>>> actor = m.Actor(name="Jon Seda")
>>> actor.save()

...and all was fine and dandy and then I tried ...

>>> movie = m.Movie.objects.get(title="Twelve Monkeys")
>>> movie.actors.add ( actor )

... and got...

Traceback (most recent call last):
  File "", line 1, in
  File "/opt/local/lib/python2.5/site-packages/Neo4j.py-0.1_SNAPSHOT-py2.5.egg/neo4j/model/django_model/__init__.py", line 623, in __get__
    return self._get_relationship(obj, self.__state_for(obj))
  File "/opt/local/lib/python2.5/site-packages/Neo4j.py-0.1_SNAPSHOT-py2.5.egg/neo4j/model/django_model/__init__.py", line 708, in _get_relationship
    states[self.name] = state = RelationshipInstance(self, obj)
  File "/opt/local/lib/python2.5/site-packages/Neo4j.py-0.1_SNAPSHOT-py2.5.egg/neo4j/model/django_model/__init__.py", line 750, in __init__
    self.__removed = pyneo.python.set() # contains relationships
AttributeError: 'module' object has no attribute 'set'

... I wonder if Tobias has done more work on the Relationships class at all?






Tuesday, 3 August 2010

Visualisation of a Neo4J database with Gephi

1.  Get the version of Gephi app that can read neo4j databases (not the main one)
bzr branch http://bazaar.launchpad.net/~bujacik/gephi/support-for-neo4j

2. Get NetBeans, open the project you've just downloaded and build it as a Mac application (you'll find it in a folder called dist). As it happens, NetBeans is quite a lovely Python IDE, once you've added a few plugins. I added a Python plugin, a regular expression one and found I could pretty much use the Python debugger out of the box (more than I can say for Eclipse) and watch variables etc. Very nice.

3. Run the code from yesterday and open the DB that gets created with Gephi... alter the layouts and you get something like this...


... which was a small crawl of a small site (showing IDs).

I have absolutely no idea what this all means but isn't it lovely to look at? I think I'm onto something.

Working Neo4J / Python code


So, this sort of works. The team at neo4j have made the iteration work for me... Usage:

>>python2.6 neo.py  http://www.wherever.com

... and what it does is go to that web site and grab a few links, following them, adding the pages and their data to a neo4j graph database.

import neo4j #See http://components.neo4j.org/neo4j.py/
import urllib2, traceback, re, urlparse, socket, sys, random, time

# Give up on any socket operation after this many seconds, so a slow site
# cannot stall the crawl.
socket.setdefaulttimeout(4) #seconds

'''This is my attempt to begin to make proper classes and functions to talk to the neo4j database, it's not meant to be fancy or clever, I just want to be able to CRUD ok-ish'''


# Open the embedded graph database and its "pages" index when the module is
# loaded. `db` and `pages` are module-level globals used by the functions
# further down this file.
# NOTE(review): a failure here is only printed, so `db` / `pages` stay
# undefined and the first function that touches them raises NameError.
try:
 db = neo4j.GraphDatabase("crawler_example3_db")
 with db.transaction:
  #How do we delete an index... which might have old nodes in? This doesn't work...any ideas... how do I empty an index, should I? Does deleting a node delete its reference from the index
  # (The bare string below is the disabled attempt at emptying the index.)
  '''try:
   for ref in db.index('pages'):
    ref.delete()
  except Exception, err:
   print err
  '''
  pages = db.index("pages", create=True) # create an index called 'pages'
  #print "Index created"
 #return db, pages
except Exception, err:
 print err
 #print "db:", db


#############     UTILITY FUNCTIONS         #############

def get_links(data, url=''):
    """Return the unique absolute http:// links found in ``data``.

    I know I should use BeautifulSoup or lxml, but for simplicity it's a
    regex.

    data -- HTML text to scan (None or '' yields an empty list).
    url  -- base address used to resolve relative hrefs.

    Links come back in document order; anything that is not http://
    (mailto:, https://, ...) is skipped.
    """
    links = []
    if not data:
        return links

    seen = set()  # O(1) duplicate check; `links` keeps the order
    for href in re.findall(r' href="?([^\s^"\'#]+)', data):
        try:
            link = urlparse.urljoin(url, href)
        except ValueError as err:
            # One malformed URL should not abort the rest of the page.
            print(err)
            continue
        link = link.replace("/..", "")  # fix relative links
        link = link.replace("'", "")
        if link.startswith('http://') and link not in seen:  # avoid mailtos
            seen.add(link)
            links.append(link)
    return links
 
def wget(url):
    """Fetch ``url`` and return its body as unicode text, or None.

    None is returned when the request fails (the error is printed) or when
    the response is not ``text/html``.
    """
    try:
        handle = urllib2.urlopen(url)
        try:
            content_type = handle.headers.get('content-type') or ''
            if 'text/html' in content_type:
                # Decode as ASCII and drop anything else, which is what
                # unicode(..., errors='ignore') did under the default
                # encoding. create_page() later calls str() on this text,
                # so it has to stay ASCII-only.
                return handle.read().decode('ascii', 'ignore')
        finally:
            handle.close()  # the response was never closed before
    except Exception as e:
        print("Error wgetting %s %s" % (e, url))
    return None

def delete_all_pages():
 with db.transaction:
  try:
   for node in db.node:
    id  = node.id
    node.delete( )
    print id, "deleted"
  except Exception, err:
   print err
   pass # fails on last iteration?
 print "All pages deleted!"


def create_page(url, code=200, follow=True, max_links=20):
    """Fetch ``url``, store it as a node in the "pages" index and return the node.

    If the index already holds a node for ``url`` that node is reused.

    url       -- address of the page to fetch and store.
    code      -- status code recorded on a newly created node.
    follow    -- when true, also create a page for the links found on this
                 page (those are created with follow=False, one level deep).
    max_links -- at most this many links are followed.

    Returns the page node, or None when the page could not be fetched as
    HTML. It opens no transaction itself; the callers in this file invoke
    it inside ``with db.transaction``.
    """
    # Get the actual page data from the web...
    data = wget(url)
    if not data:
        print("Boo! %s" % url)  # used to sit after the return, unreachable
        return None

    page_node = pages[url]  # does this page exist yet?
    if not page_node:
        page_node = db.node(url=url, code=code, data=str(data),
                            data_len=len(data))  # create a page
        pages[url] = page_node  # add to index
        print("Created: %s" % url)
    else:
        print("Exists already: %s" % url)

    # Now create a page for every link in the page
    if follow:
        print("\tfollowing links")
        links = get_links(data, url)
        print("%s found" % len(links))
        # The old cap kept 19 links when there were more than 20, yet let
        # exactly 20 through; a plain slice keeps the first max_links.
        for link in links[:max_links]:
            print("\tcreating: %s" % link)
            create_page(link, follow=False)
    return page_node

def get_one(url):
    """Return the node indexed under ``url``, creating the page if needed.

    Returns None when the lookup raises (the error is printed) or when the
    page is missing and cannot be fetched.
    """
    with db.transaction:
        try:
            node = pages[url]
            if node is None:
                print("Node is none! Creating...")
                # Hand back the node that was just created; the stale None
                # used to be returned, so callers never saw new pages.
                node = create_page(url, code=200, follow=False)
            return node
        except Exception as err:
            print(err)

    return None
 

def list_all_pages():
    """Print every node in the database, one per line with a blank line after.

    Just iterates through the nodes to make sure the data is in there; a
    node that fails to print has its error printed instead.
    """
    print("Listing all pages...")
    with db.transaction:
        for node in db.node:
            try:
                print(node)
                print("")
            except Exception as err:
                print(err)
    print("...done listing!")


def delete_one(url):
    """Delete the node indexed under ``url`` and drop its index entry.

    Nothing happens when no node is indexed under ``url``. Errors are
    printed, not raised.
    """
    with db.transaction:
        node = None  # bound up front so the except branch can inspect it
        try:
            node = pages[url]
            if node:
                # Read the id before deleting; the node may not be usable
                # once delete() has been called.
                node_id = node.id
                node.delete()
                print("Node with id= %s deleted" % node_id)
                # delete from index
                del pages[url]
        except Exception as err:
            print("Node probably not found: %s" % err)
            if node is not None:
                print(dir(node))  # let's have a look



def find_links_between_pages():
    """Add a ``links_to`` relationship for every link between stored pages.

    Each node's stored HTML is scanned for links, and each link is looked
    up with get_one(). Errors are printed at the level where they occur, so
    one bad link or node does not stop the pass.
    """
    print("Linking all pages...")
    with db.transaction:
        try:
            for node in db.node:
                try:
                    print("%s %s %s" % (str(node), node['url'],
                                        node['data_len']))
                    links = get_links(node['data'], node['url'])
                    for link in links:
                        try:
                            print(link)
                            # look to see if a node with that url exists
                            other_node = get_one(link)
                            if other_node:
                                # NOTE(review): the edge runs from the
                                # linked page to the page containing the
                                # link. The Backlink traversal follows
                                # outgoing links_to, so this may be
                                # deliberate -- confirm before changing.
                                other_node.links_to(node)
                        except Exception as err:
                            print(err)
                    print("")
                except Exception as err:
                    print(err)
                print("")
        except Exception as err:
            # iterating db.node itself failed ("fails on last iteration?")
            print(err)


class Backlink(neo4j.Traversal):
    """Depth-first traversal over outgoing ``links_to`` relationships."""
    types = [neo4j.Outgoing.links_to]
    order = neo4j.DEPTH_FIRST
    stop = neo4j.STOP_AT_END_OF_GRAPH

    def isReturnable(self, pos):
        """Skip the start position; return the rest if reached via ``links_to``."""
        if pos.is_start:
            return False
        return pos.last_relationship.type == 'links_to'


if __name__ == '__main__':

    #### LET'S GET STARTED ######
    # Usage: python2.6 neo.py [url] -- falls back to a default start page.
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        print("No url passed, using 'http://diveintopython.org/'")
        url = 'http://diveintopython.org/'

    print("Starting: %s" % url)

    # Everything below runs under try/finally so the database is shut down
    # (and its changes saved) even when one of the steps raises.
    try:
        ######## DELETE ALL THE PAGES WE HAVE SO FAR ############
        # Avoiding this because of ...
        # jpype._jexception.RuntimeExceptionPyRaisable:
        #   org.neo4j.graphdb.NotFoundException: Node[0]
        # ... errors later. We may want to add new data to each page, so
        # delete_all_pages() is not called for now.

        ########        ADD SOME ACTUAL PAGES        #################
        with db.transaction:
            # Also fetches some pages linked from this page
            create_page(url, 200, follow=True)

        ########    NOW GET SOME NODES OUT     ###############
        print("")
        print("Has our data made it to the database? Let's see...")
        # Do some fishing for nodes...
        with db.transaction:
            try:
                node = get_one(url)
                print('\tget one:' + str(node) + " ... oh yes!")
                print("\tid: %s url: %s data-length: %s"
                      % (node.id, node['url'], node['data_len']))
                print("")
            except Exception as err:
                print("Probably not a node with that URL")
                print(err)
        print("")

        #########  TRY TO ITERATE ALL PAGES FETCHED ################
        list_all_pages()

        #########    TRY TO DELETE ONE ################
        # delete_one( url ) followed by list_all_pages() would show whether
        # it has gone; left out for now.

        ######### LET'S LOOK FOR RELATIONSHIPS BETWEEN PAGES ################
        print("")
        print("Doing linking pages...")
        # goes to every page, looking for other pages in the database
        find_links_between_pages()
    finally:
        print("")
        print("shutting down, saving all the changes, etc")
        db.shutdown()