require "net/http" require "set" def bytesToString(bytes) bytes_string = "" if (bytes < 1000000) bytes_string = (bytes/1000).to_s + " KB" elsif (bytes < 1000000000) bytes_string = (bytes/1000000).to_s + " MB" else bytes_string = (bytes/1000000000).to_s + " GB" end bytes_string end if ARGV.length >= 2 start = ARGV[0] goal = ARGV[1] else start = "Gray_Whale" goal = "Atari" end if ARGV.length >= 3 branchLimit = ARGV[2].to_i else branchLimit = 100 end puts "start = " + start puts "goal = " + goal class Node def initialize(previous, page) @previous = previous @page = page if previous @depth = previous.depth + 1 else @depth = 0 end end attr_accessor :depth attr_accessor :previous attr_accessor :page end count = 0 total_branches = 0 download_count = 0 total_size = 0 size_num = 0 q = Array.new q.push(Node.new(nil, start)) closed = Set.new closed.add(start) result = nil found = false while !q.empty? && !found # get first item from queue current = q.first q.delete_at(0) count += 1 # download article, process if successful currUrl = "http://en.wikipedia.org/wiki/" + current.page resp = Net::HTTP.get_response(URI.parse(currUrl)) if resp.is_a? Net::HTTPSuccess outgoing = Hash.new # scrape all URLs to other articles (/wiki/) urls = resp.body.scan(/href="\/wiki\/[^#][^"]*"/) urls.each do |url| page = url[12..-2] page.slice!(/#.*/) if outgoing[page] != nil outgoing[page] += 1 elsif page =~ /:/ || page == "Wiktionary" || page == "Wikibooks" || page == "501%28c%29" # do nothing else outgoing[page] = 1 end end branch = outgoing.size size = resp.body.length total_branches += branch total_size += size download_count += 1 # ignore article if too many links explore = (branch < branchLimit) || q.empty? # look for matching article out = outgoing.to_a out.each do |url| if !closed.include?(url[0]) if (url[0].downcase == goal.downcase) result = Node.new(current, url[0]) found = true puts "++++++++ found! " + result.page + " ++++++++" elsif explore # add to queue q.push(Node.new(current, url[0])) closed.add(url[0]) end end end # print the article we just explored if explore puts "[" + count.to_s + "] [depth: " + current.depth.to_s + "] [branch: " + branch.to_s + "] [avg branch: " + (total_branches / download_count).to_s + "] [size: " + size.to_s + "] " + "[avg size: " + (total_size / download_count).to_s + "] " + current.page end else puts "*** error retrieving " + currUrl + " ***" end end # print path curr = result path = "" while curr if (path != "") path = " --> " + path end path = curr.page + path curr = curr.previous end puts path # print various statistics puts " +--------------------------- Stats ---------------------------+ " puts " | articles downloaded: " + download_count.to_s + " (" + count.to_s + " attempted)" puts " | bytes downloaded: " + bytesToString(total_size) + " (" + bytesToString(total_size/download_count) + " avg per article)" puts " | total links: " + total_branches.to_s + " (" + (total_branches / download_count).to_s + " avg links per article)" puts " +-------------------------------------------------------------+ "