# # Author: Jordan T. Cox # Tested & Modified: David May # Date: 2007/08/30 # Filename: link_crawler.rb # Revision Date: 2007/10/03 # Revision Number: 0.8.1 # Purpose: A simple program to crawl through a web-site and spit out # a listing of all the links it contains. # # CHANGELOG # ----------- # 2007/09/10 - 0.6 # - ADDED support for removing query parameters from linked URIs. # - FIXED support for absolute URIs which weren't parsing correctly # before. # - ADDED actual boolean parameters, requiring usage of true or false. # # 2007/09/11 - 0.7 # - CHANGED support for "strip_parameters" to take a list of GET parameters # to strip. Can also now accept 'all' to strip all. # - ADDED support for "keep_parameters" which will force certain GET # parameters to remain, no matter whether they're in strip_parameters # or not. # - Fixed some error catching to handle errors in opening of a link in # the initialize function. # - (David) Chopped hanging "&" from last parameter on returned link # # 2007/09/12 - 0.8 # - (David) ADDED support for removing anchors from URL list (i.e. # http://www.domain.com#test) / Also removes hanging #'s # - (David) ADDED support for suppressing error warning messages from output # 2007/10/03 - 0.8.1 # - (Jordan) Fixed a file format issue preventing most operating systems # and versions of Ruby from running the script. # -- # Copyright Information: # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>.
# -- # I can be reached by leaving a comment on my blog @ blog.phantomdata.com # # TODO: require "open-uri" $crawled_uris=Array.new $masks=[ /htm[l]?/,/asp[x]?/,/php[4|5]?/ ] # The default mask settings. class Crawler def initialize params # { :uri, # :cross_domains - Not yet implemented # :masks - Only items matching regexes contained in this array # will be included. # :wait - Defaults to 1second in between grabs. # } raise "URI needed for class Crawler" unless params["uri"] @uri=params["uri"] b=open(@uri) @baseuri=b.base_uri @body=b.read() @cross_domains = params["cross_domains"] ? params["cross_domains"] : false @no_warning = params["no_warning"] ? params["no_warning"] : false @no_anchors = params["no_anchors"] ? params["no_anchors"] : false @domain = URI.parse(@uri).host @keep_parameters=params["keep_parameters"] ? params["keep_parameters"] : [] @masks = params["masks"] ? params["masks"] : [/.*/] @strip_parameters=params["strip_parameters"] ? params["strip_parameters"] : [] @wait = params["wait"] ? params["wait"].to_i : 1 end def all_links() self.links.each do |link| next if $crawled_uris.include? link $crawled_uris << link yield link sleep @wait begin c=Crawler.new( { "uri"=>link, "cross_domains"=>@cross_domains, "keep_parameters"=>@keep_parameters, "masks"=>@masks, "wait" => @wait, "strip_parameters"=>@strip_parameters, "no_warning"=>@no_warning, "no_anchors"=>@no_anchors } ) c.all_links() { |link| yield link sleep @wait } rescue OpenURI::HTTPError => e if !@no_warning $stderr.puts "#{@uri}:: Error in #{link}: "+e end rescue Exception => e if !@no_warning $stderr.puts "#{@uri}:: Error in crawling. Received "+e end end end end def links() return @links if @links @links=Array.new @body.scan(/e # $stderr.puts "Invalid link found in #{link}. Raised exception was #{e}" #end end #$stderr.puts "No matching links found in #{@uri}. There were, however, #{@body.scan(/