Friday, 6 August 2010

Neo4J + Python Crawler + Open Calais + Gephi



So, today, after having spent the morning getting our LDAP server to talk to our Cyn.in instance (it worked!), I thought I'd have a bash at this. The plan was/is to create a crawler that gets a web page, finds the pages that it links to (including Word or PDF files), pumps any data found to the Open Calais API and saves the semantic Entities returned into a Neo4J database, then to go and look at it with the Gephi visualising tool (see previous posts).

This is all kinda new to me and I have no idea if what I'm doing makes sense, but I hope that once I can "look" at the data, I'll be able to figure out a way of pruning it into something usable.

The (ropey) code is here and the visualisation is shown above. I have no idea if this will be "traversable" yet, but it kind of proves to me that it's doable. Ideally I want to crawl a given pile of pages into one big soup and then get from page B to page X and "discover" the shortest route between them...

How hard can it be? :-)





5    import urllib2, urllib,  traceback, re, urlparse, socket, sys, random, time 
6    from pprint import pprint 
7    import codecs, sys 
8    from calais import Calais      #http://code.google.com/p/python-calais/ 
9    from scraper import fetch     #http://zesty.ca/python/scrape.py 
10    
# Wrap stdout in a UTF-8 stream writer so printed unicode text is encoded.
streamWriter = codecs.getwriter('utf-8')
sys.stdout = streamWriter(sys.stdout)

# Open Calais credentials and the shared client object used by analyze().
API_KEY = 'YOUR_KEY_HERE'
calaisapi = Calais(API_KEY, submitter="python-calais demo")
16    
def analyze(url=None):
    """Submit `url` to the shared Calais client and return its response."""
    return calaisapi.analyze_url(url)
20    
def neo_result(result, url):
    """Store the Calais entities found for a page in the local Neo4j graph.

    Opens the "simple_neo_calais_test" database, looks the page up in the
    "Pages" index (creating and indexing a node for it when missing) and,
    for every entity in `result`, creates one entity node and one type node
    linked as  page -has-> type <-is_a- entity.

    result -- Calais response for the page (what analyze() returns).
    url    -- address of the page; used as the key in the "Pages" index.

    The database is shut down even when Calais data or a graph operation
    raises; the exception itself still propagates to the caller.
    """
    import neo4j

    db = neo4j.GraphDatabase("simple_neo_calais_test")
    try:
        result.print_summary()

        # NOTE(review): python-calais appears to set `entities` only when the
        # response actually contains some -- confirm against the library
        # version in use. Treat a missing attribute as "no entities".
        entities = getattr(result, 'entities', [])

        with db.transaction:
            # Create the page index
            pages = db.index("Pages", create=True)
            page_node = pages[url]  # does this page exist yet?
            if not page_node:
                page_node = db.node(url=url)  # create a page
                pages[url] = page_node  # add to index
                print("Created: %s" % url)
            else:
                print("Exists already: %s" % url)

            print("%d Calais Entities" % len(entities))
            for e in entities:
                print(result.doc['info']['externalID'])  # URL
                type_name = e['_type']
                entity_value = e['name']
                relevance = e['relevance']
                instances = e['instances']  # list of contexts

                print("%s %s %s %s" % (type_name, entity_value,
                                       relevance, instances))

                # A fresh pair of nodes is created per entity occurrence;
                # nothing is looked up or shared between pages here.
                entity = db.node(value=entity_value, relevance=relevance)
                type_node = db.node(name=type_name)
                # The relationship starts at the entity, e.g.
                # Amazon is_a Company (it used to be created the other
                # way round, i.e. Company is_a Amazon).
                entity.is_a(type_node)
                page_node.has(type_node)
    finally:
        db.shutdown()
56     
59    
def print_result(result):
    """Print selected parts of a Calais result.

    Shows the summary, then each entity's type, name and available keys,
    then each topic's category, then the relations.

    result -- Calais response object (what analyze() returns).
    """
    result.print_summary()

    # NOTE(review): python-calais appears to set `entities` / `topics` only
    # when the response contains them -- confirm; fall back to empty lists.
    print("Entities")
    for e in getattr(result, 'entities', []):
        print("%s : %s" % (e['_type'], e['name']))
        print(e.keys())
        print("")

    print("Topics")
    for t in getattr(result, 'topics', []):
        print(t['category'])

    print("")
    print("Relations")
    # Called for its own output, like print_summary() above; wrapping the
    # call in a print additionally echoed its return value.
    result.print_relations()
78    
79    
80    
suffixes_to_avoid = ['css', 'js', 'zip', ]


def get_links(data, url='', suffixes_to_avoid=suffixes_to_avoid):
    """Extract and classify the http:// links found in href attributes.

    (A real HTML parser such as BeautifulSoup or lxml would be more robust;
    this is a regex scan.)

    data -- HTML text to scan; empty or None yields four empty lists.
    url  -- address the HTML came from; relative hrefs are resolved
            against it.
    suffixes_to_avoid -- file suffixes (without the dot) whose links are
            dropped; compared case-insensitively.

    Returns a tuple (links, icos, images, feeds) of lists of absolute URLs,
    each URL appearing at most once. Only links starting with "http://" are
    kept, so mailto: and https:// links are skipped. Fragments ("#...")
    are not part of the captured href.
    """
    links, icos, images, feeds = [], [], [], []
    if not data:
        return links, icos, images, feeds

    # Where a link goes, keyed by lower-cased path suffix; anything else is
    # treated as an ordinary page link.
    bucket_for = {
        'ico': icos,
        'rss': feeds, 'xml': feeds,
        'gif': images, 'png': images, 'jpg': images, 'jpeg': images,
    }
    avoid = set(s.lower() for s in suffixes_to_avoid)
    seen = set()

    # Accept double-quoted, single-quoted and unquoted attribute values.
    for link in re.findall(r' href=["\']?([^\s"\'#]+)', data):
        try:
            # fix up relative links
            link = urlparse.urljoin(url, link)
            link = link.replace("/..", "")  # drop any "/.." left after joining
            path = urlparse.urlsplit(link)[2]
        except ValueError as err:
            # One malformed URL must not lose the rest of the page's links.
            print(err)
            continue

        # A bare "/" path, e.g. http://theotherblog.com/ -- store the link
        # without the trailing slash so both spellings dedupe together.
        if path == "/":
            link = link[:-1]

        # just in case fixups
        link = link.replace("'", "")

        if not link.startswith('http://') or link in seen:
            continue
        seen.add(link)

        if "." not in path:
            links.append(link)
            continue

        suffix_found = path.split(".")[-1]
        print("%s %s" % (suffix_found, link))
        suffix = suffix_found.lower()
        if suffix in avoid:
            continue
        bucket_for.get(suffix, links).append(link)

    return links, icos, images, feeds
127   
def get_images(content, url):
    """Return absolute URLs for the src attributes found in `content`.

    content -- HTML text to scan; empty or None yields an empty list.
    url     -- address the HTML came from; relative src values are resolved
               against it.

    Every src value is kept except those whose path ends in ".js"
    (case-insensitive), so script references are left out. Duplicates are
    kept, in document order.
    """
    if not content:
        return []

    images = []
    # Accept double-quoted, single-quoted and unquoted attribute values.
    for src in re.findall(r' src=["\']?([^\s"\'#]+)', content):
        src = urlparse.urljoin(url, src)
        src = src.replace("/..", "")  # drop any "/.." left after joining
        path = urlparse.urlsplit(src)[2]
        if not path.lower().endswith(".js"):
            images.append(src)
    return images
147   
def get_icos(content, url):
    """Return the URL of a favicon for the page.

    content -- HTML text to scan for href values ending in ".ico",
               e.g. href="/static/img/favicon.ico"; may be empty or None.
    url     -- address the HTML came from; relative hrefs are resolved
               against it.

    Returns the first matching icon as an absolute URL. When the page
    references none, falls back to "<scheme>://<host>/favicon.ico" built
    from `url`, which may or may not exist on the server.
    """
    icons = []
    # Accept double-quoted, single-quoted and unquoted attribute values.
    for href in re.findall(r' href=["\']?([^\s"\'#]+\.ico)', content or ''):
        href = urlparse.urljoin(url, href)
        href = href.replace("/..", "")  # drop any "/.." left after joining
        path = urlparse.urlsplit(href)[2]
        # The match itself always ends in ".ico"; the path can only end in
        # ".js" when the ".ico" sits in the query string ("x.js?f=a.ico").
        if not path.endswith(".js"):
            icons.append(href)

    print(icons)
    if icons:
        print("%d icos" % len(icons))
        return icons[0]  # just get the first

    # try a default... you never know... might be aliased anyway...
    scheme, domain = urlparse.urlsplit(url)[:2]
    return '%s://%s/favicon.ico' % (scheme, domain)
175   
176   
mimes_u_like = ['text/html', 'application/pdf', ]


def _mime_type(headers):
    """Return the bare MIME type from the content-type header ('' if absent)."""
    # assumes `headers` is dict-like with lower-cased names -- TODO confirm
    # against scrape.fetch
    return headers.get('content-type', '').split(";")[0].strip()


def get(url, mimes_u_like=mimes_u_like, fetch_links=False):
    """Fetch a page and describe it as a dict, optionally with its links.

    url          -- address to fetch.
    mimes_u_like -- MIME types a linked document must have to be kept.
    fetch_links  -- when true, every link found on the page is fetched too.

    Returns a dict with the keys 'url', 'mime', 'status', 'message',
    'headers', 'content', 'ico', 'images', 'feeds' and 'links'. 'links' and
    'feeds' stay empty unless `fetch_links` is true; then 'feeds' holds the
    feed URLs found on the page and 'links' holds one dict (url, status,
    message, headers, mime, content) per linked document that answered 200
    with an accepted MIME type. A link that cannot be fetched is reported
    and skipped. Failure to fetch `url` itself still raises.
    """
    url, status, message, headers, content = fetch(url)

    page = {'url': url,
            'mime': _mime_type(headers),
            'status': status,
            'message': message,
            'headers': headers,
            'content': content,
            'links': [],
            'images': [],
            'feeds': []}

    page['ico'] = get_icos(content, url)
    page['images'] = get_images(content, url)

    if not fetch_links:
        # TODO: this is wrong... the links should be discoverable without
        # necessarily fetching their data.
        return page

    links_found, linked_icos, linked_images, feeds = get_links(content, url)
    print(url)
    print("%d links found" % len(links_found))
    print("%d icos" % len(linked_icos))
    print("%d images" % len(linked_images))
    print(linked_images)
    print("%d feeds" % len(feeds))

    links = []
    for link in links_found:
        try:
            (link_url, link_status, link_message,
             link_headers, link_content) = fetch(link)
        except Exception as err:
            # One unreachable link must not abort the rest of the crawl.
            print("%s %s" % (link, err))
            continue

        mime = _mime_type(link_headers)
        if mime not in mimes_u_like:
            continue
        print("\t%s %s %s" % (link_status, mime, link))
        if link_status == 200:
            links.append({'url': link_url,
                          'status': link_status,
                          'message': link_message,
                          'headers': link_headers,
                          'mime': mime,
                          'content': link_content})

    print("%d Used" % len(links))
    page['links'] = links
    page['feeds'] = feeds
    return page
229   
def _analyze_and_store(url):
    """Run Calais over `url` and save the result to Neo4j.

    Failures are reported and swallowed so one bad page does not stop the
    crawl.
    """
    try:
        neo_result(analyze(url), url)
    except Exception as err:
        print("%s %s" % (url, err))


if __name__ == '__main__':
    url = 'http://theotherblog.com'
    print("Getting: %s" % url)
    page = get(url, fetch_links=True)

    # The start page is handled like every linked page, including the
    # error handling.
    _analyze_and_store(url)

    print("Getting %s links.." % len(page['links']))
    for link_dict in page['links']:
        print("Getting link: %s" % link_dict['url'])
        _analyze_and_store(link_dict['url'])

    print("Done!")




No comments:

Post a Comment