#!/usr/bin/ruby
# frozen_string_literal: true

# Crawls a site breadth-first starting at BASE_URL and prints, one per line
# in the form "link|page", every link found on a fetched page that was NOT
# queued for crawling (already seen, ignored scheme, or outside BASE_URL).
#
# Uses hpricot : http://code.whytheluckystiff.net/hpricot/
require 'rubygems'
require 'hpricot'
require 'open-uri'
require 'uri'

# Resolves +link+ against +base+.
#
# @param link [String, nil] the raw href value taken from an anchor
# @param base [String] absolute URL the href is relative to
# @return [String, nil] nil when +link+ is nil; +link+ unchanged when it is
#   already an http(s) URL or cannot be parsed as a URI; otherwise the
#   result of joining it onto +base+
def makeAbsolute(link, base)
  return link if link.nil? || link =~ %r{\Ahttps?://}

  # URI.join handles root-relative ("/x"), page-relative ("x", "../x") and
  # other-scheme ("mailto:...") references without producing "host//x" or
  # gluing the base in front of a mailto/javascript href.
  URI.join(base, link).to_s
rescue URI::Error
  # Malformed href: hand it back untouched so the caller can still report it.
  link
end

BASE_URL = 'http://startingurl.com/'

# Upper bound on the number of URLs taken off the queue.
MAX_PAGES = 10_000

# Ignore mailtos and javascript links. Anchored to the scheme so that an
# ordinary URL that merely contains the word "javascript" is not rejected.
IGNORE_RE = /\A\s*(?:javascript|mailto):/i.freeze

# URLs are marked as seen when they are queued (not when they are fetched),
# so the same URL can never be queued or fetched twice.
visited = { BASE_URL => true }
working_queue = [BASE_URL]
num = 0

# shift takes from the front of the queue (FIFO), which is what makes this a
# breadth-first traversal.
while num < MAX_PAGES && (curr = working_queue.shift)
  num += 1
  # puts " [-] Visiting: #{curr}"

  begin
    # URI#open is provided by open-uri; the block form closes the handle
    # once the document has been parsed.
    doc = URI.parse(curr).open { |io| Hpricot(io) }
  rescue StandardError => e
    # One unreachable or broken page must not abort the whole crawl.
    warn " [!] Skipping #{curr}: #{e.message}"
    next
  end

  (doc / 'a').each do |anchor|
    # Relative hrefs are relative to the page they appear on.
    link = makeAbsolute(anchor.attributes['href'], curr)
    next if link.nil?

    if !visited[link] && link !~ IGNORE_RE && link.start_with?(BASE_URL)
      visited[link] = true
      working_queue.push(link)
    else
      puts "#{link}|#{curr}"
    end
  end
end