require "net/http"
require "rpdf2txt/parser"
require "date"
require "rubygems"
# NOTE(review): require_gem only exists in very old RubyGems releases; it is
# kept because the rest of the file (set_table_name) targets the same era.
# Confirm the target environment before modernising.
require_gem "activerecord"

# Regular expressions used by the parser. They are real constants (not
# top-level local variables) because top-level locals are not visible inside
# method bodies such as import_url.

# Date/time part of an incident header, e.g. "Jan 05, 2007-Friday at 13:45".
DATETIME_RE = /[A-Z][a-z]{2} [0-9]{2}, [0-9]{4}-[A-Z][a-z]+ at [0-9]{2}:[0-9]{2}/
# Stamp part of an incident header, e.g. "07-01-05-012345".
STAMP_RE = /[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]+/
# Identifier embedded in a summary, e.g. "DR#0700123".
IDENTIFIER_IN_SUMMARY_RE = /DR\#[\d]+/

# connect to the database
# NOTE(review): the connection values below are the literal strings found in
# the file; presumably placeholders to be replaced per deployment.
ActiveRecord::Base.establish_connection(
  :adapter  => "mysql",
  :host     => "host",
  :database => "database",
  :username => "username",
  :password => "password"
)

# One parsed incident; rows are stored in the "crimes" table.
class Incident < ActiveRecord::Base
  set_table_name "crimes"
end

# Fills in a missing identifier from a "DR#<digits>" reference in the summary.
# Does nothing when there is no incident, the identifier is already set, or
# the summary is missing / contains no such reference.
def recover_identifier(incident)
  return if incident.nil? || !incident.identifier.nil?

  puts "+++ Last identifier is empty, searching for identifier in summary..."
  reference = incident.summary && incident.summary[IDENTIFIER_IN_SUMMARY_RE]
  return if reference.nil?

  number = reference[3..-1] # drop the leading "DR#"
  puts "+++ Found! \n{" + number + "}"
  incident.identifier = number
end

# Turns the text lines of a report into an array of unsaved Incident objects.
#
# A line containing both a stamp and a date/time starts a new incident; the
# following lines are matched, in order, against: a digits-only identifier,
# "Location:", "cc:", a pending disposition value, and summary text. Lines
# that match nothing (or that appear before the first incident header) are
# reported and discarded.
def parse_incidents(lines)
  incidents = []
  in_summary = false    # true while following lines continue the summary
  awaiting_disp = false # true when "Disp:" was seen but its value is on the next line

  lines.each do |line|
    # Header line. Requiring the date/time as well keeps a stray stamp-like
    # number from starting a bogus incident (and from crashing on a nil index).
    if line =~ STAMP_RE && line =~ DATETIME_RE
      recover_identifier(incidents.last)

      incident = Incident.new
      incidents << incident
      in_summary = false
      awaiting_disp = false

      # The category is the leading run without lowercase letters, up to the
      # first capitalised word; the subcategory runs from there to the date.
      category_length = line.slice(/[^a-z]*(?=[A-Z][a-z])/).length
      datetime_index = line.index(DATETIME_RE)
      incident.category = line[0, category_length].strip
      incident.subcategory = line[category_length...datetime_index].strip
      incident.time = DateTime.parse(line.slice(DATETIME_RE))
      incident.stamp = line.slice(STAMP_RE)
      next
    end

    incident = incidents.last
    if incident.nil?
      # nothing to attach this line to yet
      puts "discarding line: {" + line + "}"
    elsif line =~ /^[0-9]+$/
      # identifier
      incident.identifier = line.to_i
    elsif line =~ /Location:/
      incident.location = line.sub(/Location:/, "").strip
    elsif line =~ /cc:/
      incident.cc = line.sub(/cc:/, "").strip
      in_summary = false
    elsif awaiting_disp
      # disposition value that followed a bare "Disp:" header
      incident.disp = line.sub(/Disp:/, "").strip
      awaiting_disp = false
    elsif line =~ /Summary:/ || in_summary
      fragment = line.sub(/Summary:/, "").strip
      if incident.summary.nil?
        incident.summary = fragment
      else
        incident.summary << (" " + fragment)
      end

      if incident.summary =~ /Disp:/
        # cut the "Disp:" header and its data out of the summary
        disp_text = incident.summary.slice!(/\s*Disp:.*/)
        incident.disp = disp_text.sub(/Disp:/, "").strip
        # an empty value means the data is on the next line
        awaiting_disp = (incident.disp == "")
        in_summary = false
      else
        in_summary = true
      end
    else
      puts "discarding line: {" + line + "}"
    end
  end

  # The last incident has no following header line to trigger the fallback.
  recover_identifier(incidents.last)
  incidents
end

# Prints a one-line listing for each incident and saves it.
#
# NOTE(review): building the listing raises when identifier, category,
# subcategory or location is nil, so such an incident is reported via the
# rescue below and NOT saved. This matches the previous behaviour; confirm
# whether skipping incomplete incidents is intended.
def save_incidents(incidents)
  incidents.each do |incident|
    begin
      puts(("%8d" % incident.identifier) + " " +
           ("%25s" % ("{" + incident.category + "}")) + " " +
           ("%45s" % ("{" + incident.subcategory + "}")) + " " +
           ("%60s" % ("{" + incident.location + "}")))
      incident.save
    rescue StandardError => error
      # StandardError rather than Exception so Ctrl-C / exit still work
      puts error
    end
  end
end

# Downloads the PDF at +url+, extracts its text, parses the incidents it
# contains and saves them. Returns the array of incidents, or nil when the
# HTTP request did not succeed.
def import_url(url)
  puts "================== Processing: " + url + " =================="
  resp = Net::HTTP.get_response(URI.parse(url))
  unless resp.is_a?(Net::HTTPSuccess)
    puts "!!! Skipping " + url + ": HTTP " + resp.code.to_s + " " + resp.message.to_s
    return nil
  end

  # parse the pdf, extract the text, split into lines
  text = Rpdf2txt::Parser.new(resp.body).extract_text
  save_incidents(parse_incidents(text.split("\n")))
end

if ARGV.empty?
  # no arguments: import yesterday's report, named MMDDYY.pdf
  yesterday = Date.today - 1
  import_url("http://capsnet.usc.edu/DPS/webpdf/" + yesterday.strftime("%m%d%y") + ".pdf")
else
  # import each argument
  ARGV.each do |arg|
    import_url(arg)
  end
end