#
# Author: Jordan T. Cox
# Tested & Modified: David May
# Date: 2007/08/30
# Filename: link_crawler.rb
# Revision Date: 2007/10/03
# Revision Number: 0.8.1
# Purpose: A simple program to crawl through a web-site and spit out
# a listing of all the links it contains.
#
# CHANGELOG
# -----------
# 2007/09/10 - 0.6
# - ADDED support for removing query parameters from linked URIs.
# - FIXED support for absolute URIs which weren't parsing correctly
# before.
# - ADDED actual boolean parameters, requiring usage of true or false.
#
# 2007/09/11 - 0.7
# - CHANGED support for "strip_parameters" to take a list of GET parameters
# to strip. Can also now accept 'all' to strip all.
# - ADDED support for "keep_parameters" which will force certain GET
# parameters to remain, no matter whether they're in strip_parameters
# or not.
# - Fixed some error catching to handle errors in opening of a link in
# the initialize function.
# - (David) Chopped hanging "&" from last parameter on returned link
#
# 2007/09/12 - 0.8
# - (David) ADDED support for removing anchors from URL list (ie.
# http://www.domain.com#test) / Also removes hanging #'s
# - (David) ADDED support for suppressing error warning messages from output
# 2007/10/03 - 0.8.1
# - (Jordan) Fixed a file format issue preventing most operating systems
# and versions of Ruby from running the script.
# --
# Copyright Information:
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# --
# I can be reached by leaving a comment on my blog @ blog.phantomdata.com
#
# TODO:
require "open-uri"
# Links that have already been yielded by Crawler#all_links. Shared by every
# Crawler instance so the same link is never crawled twice.
$crawled_uris = []
# The default mask settings: unanchored patterns for htm/html, asp/aspx and
# php/php4/php5. ("[4|5]" would also have accepted a literal "|".)
$masks = [/html?/, /aspx?/, /php[45]?/]
class Crawler
# Fetches the page at params["uri"] and stores the crawl options.
#
# params is a Hash with String keys:
#   "uri"              - (required) absolute URI of the page to fetch.
#   "cross_domains"    - Not yet implemented. Defaults to false.
#   "no_warning"       - Stored as given. Defaults to false.
#   "no_anchors"       - Stored as given. Defaults to false.
#   "keep_parameters"  - Stored as given. Defaults to [].
#   "strip_parameters" - Stored as given. Defaults to [].
#   "masks"            - Only items matching regexes contained in this array
#                        will be included. Defaults to [/.*/].
#   "wait"             - Seconds in between grabs (converted with to_i).
#                        Defaults to 1.
#
# Raises RuntimeError when "uri" is missing, ArgumentError when the URI is
# not of a kind open-uri can fetch, and lets errors from URI.parse and
# open-uri (e.g. URI::InvalidURIError, OpenURI::HTTPError) propagate.
def initialize(params)
  raise "URI needed for class Crawler" unless params["uri"]

  @uri = params["uri"]

  # Parse first and fetch through the URI object rather than Kernel#open:
  # links come from crawled pages, and Kernel#open would treat "|cmd" as a
  # shell pipe and a bare path as a local file. It also keeps working on
  # Ruby versions where open-uri no longer patches Kernel#open.
  target = URI.parse(@uri)
  unless target.respond_to?(:open)
    raise ArgumentError, "Cannot fetch #{@uri}: not a URI open-uri can open"
  end

  # Block form so the StringIO/Tempfile behind the response is closed as
  # soon as the body has been read.
  target.open do |page|
    @baseuri = page.base_uri
    @body = page.read
  end

  @domain = target.host
  @cross_domains = params["cross_domains"] || false
  @no_warning = params["no_warning"] || false
  @no_anchors = params["no_anchors"] || false
  @keep_parameters = params["keep_parameters"] || []
  @masks = params["masks"] || [/.*/]
  @strip_parameters = params["strip_parameters"] || []
  @wait = params["wait"] ? params["wait"].to_i : 1
end
# Crawls depth-first from this page, yielding every link that has not been
# seen before.
#
# Each link from #links that is not yet in $crawled_uris is recorded there
# and yielded. After sleeping @wait seconds, a new Crawler is built for that
# link with this crawler's options and its links are yielded in turn.
#
# Any StandardError raised while fetching or crawling a child page is
# written to $stderr (unless @no_warning is set) and the crawl carries on
# with the next link. This includes errors raised by the caller's block
# while it handles a nested link. Interrupt and SystemExit are not rescued,
# so the crawl can still be stopped.
#
# Returns the array from #links, or an Enumerator when no block is given.
def all_links
  return enum_for(:all_links) unless block_given?

  links.each do |link|
    next if $crawled_uris.include?(link)

    $crawled_uris << link
    yield link
    sleep @wait
    begin
      child = Crawler.new({ "uri" => link,
                            "cross_domains" => @cross_domains,
                            "keep_parameters" => @keep_parameters,
                            "masks" => @masks, "wait" => @wait,
                            "strip_parameters" => @strip_parameters,
                            "no_warning" => @no_warning,
                            "no_anchors" => @no_anchors })
      child.all_links do |nested_link|
        yield nested_link
        # NOTE(review): the child already sleeps after each yield, so this
        # adds one more @wait per nesting level. Kept to preserve timing.
        sleep @wait
      end
    rescue OpenURI::HTTPError => e
      # Interpolate the message: String + Exception raises TypeError.
      $stderr.puts "#{@uri}:: Error in #{link}: #{e.message}" unless @no_warning
    rescue StandardError => e
      $stderr.puts "#{@uri}:: Error in crawling. Received #{e.message}" unless @no_warning
    end
  end
end
def links()
return @links if @links
@links=Array.new
@body.scan(/e
# $stderr.puts "Invalid link found in #{link}. Raised exception was #{e}"
#end
end
#$stderr.puts "No matching links found in #{@uri}. There were, however, #{@body.scan(/