require "mysql"
require "set"

# Breadth-first search for a chain of links between two article titles.
#
# Usage: <script> [start_title goal_title [branch_limit]]
#
# The script reads the `page`, `redirect` and `pagelinks` tables (namespace 0
# only) through a MySQL connection, expands articles in FIFO order starting
# from `start`, and stops as soon as an outgoing link matches `goal`
# (case-insensitively). It then prints the path found and a statistics box.
#
# NOTE(review): the table and column names look like a MediaWiki database
# import -- confirm against the actual schema in use.

# Debug switches: trace every page/redirect lookup, and report lookup errors.
debug_get_links = false
debug_show_errors = false

# Formats a byte count as a whole number of KB, MB or GB (decimal units).
#
# bytes - Integer byte count.
#
# Returns a String such as "12 MB". Integer division is used, so the value is
# truncated and anything below 1000 bytes is reported as "0 KB".
def bytesToString(bytes)
  if bytes < 1000000
    (bytes / 1000).to_s + " KB"
  elsif bytes < 1000000000
    (bytes / 1000000).to_s + " MB"
  else
    (bytes / 1000000000).to_s + " GB"
  end
end

# Divides without raising ZeroDivisionError or producing NaN.
#
# numerator   - Integer or Float.
# denominator - Integer or Float.
#
# Returns numerator / denominator, or a zero of the numerator's numeric type
# (0 for integers, 0.0 for floats) when the denominator is zero.
def safeDivide(numerator, denominator)
  # numerator * 0 keeps the numeric type, so the printed form stays consistent.
  return numerator * 0 if denominator == 0
  numerator / denominator
end

# Command line: start and goal titles, then an optional branching limit.
if ARGV.length >= 2
  start = ARGV[0]
  goal = ARGV[1]
else
  start = "Gray_Whale"
  goal = "Atari"
end

if ARGV.length >= 3
  branchLimit = ARGV[2].to_i
else
  branchLimit = 100
end

puts "start = " + start
puts "goal = " + goal

# A search-tree node: an article title plus the node it was reached from.
class Node
  # previous - the parent Node, or nil for the root of the search.
  # page     - the article title String this node stands for.
  def initialize(previous, page)
    @previous = previous
    @page = page
    # Depth is the number of links followed from the root to reach this node.
    if previous
      @depth = previous.depth + 1
    else
      @depth = 0
    end
  end

  attr_accessor :depth
  attr_accessor :previous
  attr_accessor :page
end

# open mysql connection
m = Mysql.new("database_address", "username", "password")
m.select_db("db_name")

# statistics
num_pages = 0
num_pages_attempted = 0
num_pages_not_found = 0
num_redirects = 0
num_redirects_attempted = 0
num_redirects_not_found = 0
total_branches = 0
total_size = 0
start_time = Time.now

# keep track of open nodes (FIFO queue)
q = Array.new
q.push(Node.new(nil, start))

# keep track of closed nodes (titles that have already been queued)
closed = Set.new
closed.add(start)

# store the result and signal it was found
result = nil
found = false

# Link targets that are never recorded as outgoing links.
ignored_titles = ["Wiktionary", "Wikibooks", "501%28c%29"]

begin
  while !q.empty? && !found
    num_pages_attempted += 1
    current = q.shift
    outgoing = Hash.new
    article_title = current.page
    not_found = false
    redirect = false

    # Titles visited while resolving redirects for this node; used to stop a
    # redirect chain that loops back on itself instead of spinning forever.
    redirect_chain = Set.new
    redirect_chain.add(article_title)

    #puts "** article title: " + article_title if debug_get_links

    # Resolve the title to a page row, following redirects until a real
    # article is reached or a lookup fails.
    begin
      # get the page id, whether or not it is a redirect, and the length
      page_result = m.query("SELECT page_id, page_is_redirect, page_len FROM page WHERE page_namespace = 0 AND page_title = '" + Mysql.quote(article_title) + "'")

      if page_result.num_rows < 1
        puts "Error: page not found (" + article_title + ")" if debug_show_errors
        num_pages_not_found += 1
        not_found = true
      else
        not_found = false
        h = page_result.fetch_hash
        id = h['page_id']
        size = h['page_len'].to_i
        puts "page_id: " + id + " page_is_redirect: " + h['page_is_redirect'] if debug_get_links

        if h['page_is_redirect'] == '1'
          num_redirects_attempted += 1
          redir_result = m.query("SELECT rd_title FROM redirect WHERE rd_namespace = 0 AND rd_from = " + id)

          if redir_result.num_rows < 1
            num_redirects_not_found += 1
            not_found = true
            puts "Error: redirect not found (" + article_title + ")" if debug_show_errors
          else
            target = redir_result.fetch_hash['rd_title']

            if redirect_chain.include?(target)
              # Cyclic redirect: count it as a failed redirect and give up on
              # this node rather than looping without end.
              num_redirects_not_found += 1
              not_found = true
              puts "Error: redirect loop (" + target + ")" if debug_show_errors
            else
              num_redirects += 1
              redirect_chain.add(target)
              article_title = target
              puts "rd_title: " + article_title if debug_get_links
            end

            puts "Error: more than one redirect, skipping rest" if redir_result.num_rows > 1 && debug_show_errors
          end

          redirect = true
        else
          redirect = false
        end

        puts "Error: more than one article, skipping rest" if page_result.num_rows > 1 && debug_show_errors
      end

      puts "not_found: " + not_found.to_s + " redirect: " + redirect.to_s if debug_get_links
    end until not_found || !redirect

    if !not_found
      #puts "Article id"
      links_result = m.query("SELECT pl_title FROM pagelinks WHERE pl_namespace = 0 AND pl_from = " + id)

      if links_result.num_rows < 1
        puts "No links found"
      else
        # Tally each distinct link target, skipping the ignored titles.
        links_result.each_hash do |row|
          page = row['pl_title']
          next if ignored_titles.include?(page)
          outgoing[page] = (outgoing[page] || 0) + 1
        end
      end

      branch = outgoing.size
      num_pages += 1
      total_branches += branch
      total_size += size

      # Pages with too many links are not expanded, unless the queue would
      # otherwise run dry. Their links are still checked against the goal.
      explore = (branch < branchLimit) || q.empty?

      outgoing.each_key do |title|
        if !closed.include?(title)
          if title.downcase == goal.downcase
            result = Node.new(current, title)
            found = true
            puts "++++++++ found! " + result.page + " ++++++++"
          elsif explore
            q.push(Node.new(current, title))
            closed.add(title)
          end
        end
      end

      if explore
        puts "[" + num_pages_attempted.to_s + "] [depth: " + current.depth.to_s + "] [branch: " + branch.to_s + "] [avg branch: " + (total_branches / num_pages).to_s + "] [size: " + size.to_s + "] " + "[avg size: " + (total_size / num_pages).to_s + "] " + current.page
      end
    end
  end
ensure
  # Release the database connection even if a query raised.
  m.close
end

# Walk back from the goal node to the root to build "start --> ... --> goal".
# When nothing was found, result is nil and an empty line is printed.
curr = result
path = ""
while curr
  if path != ""
    path = " --> " + path
  end
  path = curr.page + path
  curr = curr.previous
end
puts path

elapsed_time = Time.now - start_time

# Every average below goes through safeDivide so that a run which loaded no
# pages (or met no redirects) still prints its statistics instead of raising
# ZeroDivisionError or showing NaN.
puts " +--------------------------- Stats ---------------------------+ "
puts " | pages: " + num_pages.to_s + " (" + num_pages_attempted.to_s + " attempted, " + num_pages_not_found.to_s + " failed, " + safeDivide(num_pages.to_f, num_pages_attempted).to_s + " success rate)"
puts " | redirects: " + num_redirects.to_s + " (" + num_redirects_attempted.to_s + " attempted, " + num_redirects_not_found.to_s + " failed, " + safeDivide(num_redirects.to_f, num_redirects_attempted).to_s + " success rate)"
puts " | bytes: " + bytesToString(total_size) + " (" + bytesToString(safeDivide(total_size, num_pages)) + " avg per article)"
puts " | total links: " + total_branches.to_s + " (" + safeDivide(total_branches.to_f, num_pages).to_s + " avg links per article)"
puts " | time elapsed: " + elapsed_time.to_s + " seconds"
puts " +-------------------------------------------------------------+ "