So, today, after having spent the morning getting our LDAP server to talk to our Cyn.in instance (it worked!), I thought I'd have a bash at this. The plan was/is to create a crawler that gets a web page, finds the pages it links to (including Word or PDF files), pumps any data found through the Open Calais API, saves the semantic entities returned into a Neo4J database, and then goes and looks at it all with the Gephi visualisation tool (see previous posts).
This is all kinda new to me; I have no idea if what I'm doing makes sense, but I hope that once I can "look" at the data, I'll be able to figure out a way of pruning it into something usable.
The (ropey) code is here and the visualisation is shown above. I have no idea if this will be "traversable" yet, but it kind of proves to me that it's doable. Ideally I want to crawl a given pile of pages into one big soup and then get from page B to page X and "discover" the shortest route between them...
How hard can it be :-)
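The shortest-route bit isn't in the code below yet. As a placeholder, here's a minimal sketch of the idea as a plain breadth-first search over an adjacency dict (the link_graph name and shape are made up for illustration; a proper version would presumably use Neo4J's own traversal machinery instead):

from collections import deque

def shortest_route(link_graph, start, goal):
    'Breadth-first search for the shortest chain of links from start to goal.'
    # link_graph is a plain dict: {url: [linked url, linked url, ...], ...}
    # built up while crawling -- purely illustrative, not the Neo4J graph.
    queue = deque([[start]])
    seen = set([start])
    while queue:
        path = queue.popleft()
        page = path[-1]
        if page == goal:
            return path          # list of urls from start to goal
        for link in link_graph.get(page, []):
            if link not in seen:
                seen.add(link)
                queue.append(path + [link])
    return None                  # no route found

# e.g. shortest_route(link_graph, 'http://pageB.example', 'http://pageX.example')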
import urllib2, urllib, traceback, re, urlparse, socket, sys, random, time
from pprint import pprint
import codecs
from calais import Calais    # http://code.google.com/p/python-calais/
from scraper import fetch    # http://zesty.ca/python/scrape.py

# Make sure printing entity names doesn't choke on non-ASCII characters
streamWriter = codecs.lookup('utf-8')[-1]
sys.stdout = streamWriter(sys.stdout)

API_KEY = 'YOUR_KEY_HERE'
calaisapi = Calais(API_KEY, submitter="python-calais demo")


def analyze(url=None):
    'Send a URL off to the Open Calais API and return the result object.'
    result = calaisapi.analyze_url(url)
    return result


def neo_result(result, url):
    'Save the Calais entities found for a page into the Neo4J database.'
    import neo4j

    db = neo4j.GraphDatabase("simple_neo_calais_test")

    result.print_summary()

    with db.transaction:
        # Create (or open) the page index
        pages = db.index("Pages", create=True)
        page_node = pages[url]   # does this page exist yet?
        if not page_node:
            page_node = db.node(url=url)   # create a page
            pages[url] = page_node         # add it to the index
            print "Created:", url
        else:
            print "Exists already:", url

        print len(result.entities), "Calais Entities"
        for e in result.entities:
            print result.doc['info']['externalID']   # the URL
            entity_type = e['_type']
            entity_value = e['name']
            relevance = e['relevance']
            instances = e['instances']

            print entity_type, entity_value, relevance, instances   # instances is a list of contexts

            # Create an entity node and a type node, then wire them up
            entity = db.node(value=entity_value, relevance=relevance)
            type_node = db.node(name=entity_type)
            entity.is_a(type_node)       # e.g. Amazon is_a Company
            page_node.has(type_node)

    db.shutdown()


def print_result(result):
    'Custom code to just show certain bits of the result obj'
    result.print_summary()

    print "Entities"
    for e in result.entities:
        print e['_type'], ":", e['name']
        print e.keys()
        print

    print "Topics"
    for t in result.topics:
        print t['category']
        #print t

    print
    print "Relations"
    print result.print_relations()


suffixes_to_avoid = ['css', 'js', 'zip', ]

def get_links(data, url='', suffixes_to_avoid=suffixes_to_avoid):
    'I know I should use BeautifulSoup or lxml'
    links = []
    icos = []
    feeds = []
    images = []
    try:
        found_links = re.findall(' href="?([^\s^"\'#]+)', data)   # think this strips off anchors

        for link in found_links:
            # fix up relative links
            link = urlparse.urljoin(url, link)
            link = link.replace("/..", "")   # fix relative links

            # check to see if the path is just "/"... for example, http://theotherblog.com/
            path = urlparse.urlsplit(link)[2]
            if path == "/":
                link = link[:-1]   # take off the trailing slash (or should we put it on?)

            # just-in-case fixups
            link = link.replace("'", "")

            if link not in links and link[:7] == 'http://':   # avoid mailto:, https://
                if "." in path:
                    suffix_found = path.split(".")[-1]
                    print suffix_found, link
                    if suffix_found in suffixes_to_avoid:
                        pass
                    else:
                        if suffix_found == 'ico':
                            icos.append(link)
                        elif suffix_found == 'rss' or suffix_found == 'xml':
                            feeds.append(link)
                        elif suffix_found in ['gif', 'png', 'jpg', 'jpeg']:
                            images.append(link)
                        else:
                            links.append(link)
                else:
                    links.append(link)

    except Exception, err:
        print err

    return links, icos, images, feeds


def get_images(content, url):
    'Pull out anything in a src="..." attribute, skipping scripts.'
    found_images = re.findall(' src="?([^\s^"\'#]+)', content)
    images = []

    for image in found_images:
        image = urlparse.urljoin(url, image)
        image = image.replace("/..", "")
        path = urlparse.urlsplit(image)[2]

        if path[-3:] == ".js":
            pass
        else:
            images.append(image)
    return images


def get_icos(content, url):
    #href="/static/img/favicon.ico"
    #image/x-icon
    found_images = re.findall(' href="?([^\s^"\'#]+\.ico)', content)
    images = []

    for image in found_images:
        image = urlparse.urljoin(url, image)
        image = image.replace("/..", "")
        path = urlparse.urlsplit(image)[2]

        if path[-3:] == ".js":
            pass
        else:
            images.append(image)

    print images
    if len(images) > 0:
        print len(images), "icos"
        image = images[0]   # just get the first
    else:
        # try a default... you never know... might be aliased anyway...
        scheme, domain, path, query, x = urlparse.urlsplit(url)
        image = '%s://%s/favicon.ico' % (scheme, domain)

    return image


mimes_u_like = ['text/html', 'application/pdf', ]

def get(url, mimes_u_like=mimes_u_like, fetch_links=False):
    'Fetch a page and, optionally, the pages it links to.'
    url, status, message, headers, content = fetch(url)
    mime = headers['content-type'].split(";")[0].strip()

    page = {'url': url,
            'mime': mime,
            'status': status,
            'message': message,
            'headers': headers,
            'content': content,
            'links': [],
            'images': [],
            'feeds': []}

    ico = get_icos(content, url)
    page['ico'] = ico

    images_found = get_images(content, url)
    page['images'] = images_found

    if fetch_links == False:
        return page   # this is wrong... I want to discover the links without necessarily getting their data
    else:
        links_found, icos, images, feeds = get_links(content, url)
        print url
        print len(links_found), "links found"
        links = []
        print len(icos), "icos"
        print len(images), "images"
        print images
        print len(feeds), "feeds"

        for link in links_found:
            url, status, message, headers, content = fetch(link)
            mime = headers['content-type'].split(";")[0].strip()
            if mime in mimes_u_like:
                print "\t", status, mime, link
                if status == 200:
                    links.append({'url': url,
                                  'status': status,
                                  'message': message,
                                  'headers': headers,
                                  'mime': mime,
                                  'content': content})

        print len(links), "Used"
        page['links'] = links
        return page


if __name__ == '__main__':
    url = 'http://theotherblog.com'
    print "Getting:", url
    page = get(url, fetch_links=True)

    result = analyze(url)
    neo_result(result, url)

    print "Getting %s links.." % len(page['links'])
    for link_dict in page['links']:
        print "Getting link:", link_dict['url']
        try:
            result = analyze(link_dict['url'])
            neo_result(result, link_dict['url'])
        except Exception, err:
            print link_dict['url'], err

    print "Done!"
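One bit of the pruning mentioned above already jumps out: neo_result calls db.node(...) for every entity it sees, so "Amazon" appearing on ten pages becomes ten separate nodes (and likewise for the type nodes). A rough, untested sketch of sharing nodes via an index, reusing the same idiom as the Pages index above (assuming a second index behaves the same way in these neo4j.py bindings):

def get_or_create(db, index, key, **properties):
    'Reuse an existing node from the index, or create one and register it.'
    node = index[key]
    if not node:
        node = db.node(**properties)
        index[key] = node
    return node

# inside neo_result's transaction, something along the lines of:
#   entities = db.index("Entities", create=True)
#   types = db.index("Types", create=True)
#   entity = get_or_create(db, entities, entity_value,
#                          value=entity_value, relevance=relevance)
#   type_node = get_or_create(db, types, entity_type, name=entity_type)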