require "net/http"
require "rpdf2txt/parser"
require "date"

require "rubygems"
require_gem "activerecord"

# Regular expressions describing the date/time and report-stamp fields found
# on the first line of each incident.
# NOTE: these are top-level local variables, so they are not in scope inside
# method bodies introduced with `def`.
datetimeRE = Regexp.new('[A-Z][a-z]{2} [0-9]{2}, [0-9]{4}-[A-Z][a-z]+ at [0-9]{2}:[0-9]{2}')
stampRE = Regexp.new('[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]+')

# Settings for the database connection shared by the ActiveRecord models in
# this script.
# NOTE(review): the values look like placeholders ("host", "database", ...) --
# confirm the real settings are filled in before running.
connection_settings = {
  :adapter  => "mysql",
  :host     => "host",
  :database => "database",
  :username => "username",
  :password => "password"
}

# Open the connection on ActiveRecord::Base.
ActiveRecord::Base.establish_connection(connection_settings)

# ActiveRecord model for one parsed incident. The table name is set
# explicitly to "crimes" instead of relying on the name ActiveRecord would
# derive from the class name.
class Incident < ActiveRecord::Base
  # `set_table_name` was deprecated in ActiveRecord 3.2 and removed in 4.0;
  # the `table_name=` writer is accepted by old and new releases alike.
  self.table_name = "crimes"
end
  
# Patterns that recognise the first line of an incident. They repeat the
# file-level `datetimeRE` / `stampRE` patterns on purpose: those are top-level
# local variables, which are out of scope inside a `def` body, so referring to
# them from import_url raised NameError.
INCIDENT_DATETIME_RE = /[A-Z][a-z]{2} [0-9]{2}, [0-9]{4}-[A-Z][a-z]+ at [0-9]{2}:[0-9]{2}/
INCIDENT_STAMP_RE    = /[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]+/

# Downloads the PDF at +url+, extracts its text, parses the text into
# Incident records, then prints and saves each one.
#
# url - String URL of the PDF report.
#
# Returns the Array of parsed incidents, or nil when the response was not a
# Net::HTTPSuccess (a message is printed in that case). Progress and
# diagnostics are written to stdout.
def import_url(url)
  puts "================== Processing: " + url + " =================="

  resp = Net::HTTP.get_response(URI.parse(url))
  unless resp.is_a? Net::HTTPSuccess
    # Report the failure instead of returning silently.
    puts "!!! Could not download " + url + " (" + resp.code.to_s + " " + resp.message.to_s + ")"
    return nil
  end

  # parse the pdf, extract the text, split into lines
  text = Rpdf2txt::Parser.new(resp.body).extract_text
  incidents = parse_incident_lines(text.split("\n"))

  save_incidents(incidents)
  incidents
end

# Walks the extracted text line by line and builds one Incident per header
# line. Returns the Array of (unsaved) incidents in document order.
def parse_incident_lines(lines)
  incidents = []
  in_summary = false    # following lines continue a multi-line summary
  awaiting_disp = false # "Disp:" was seen but its value is on the next line

  lines.each do |line|
    # A header must carry both the stamp and the date/time; a line that only
    # mentions a stamp (e.g. inside a summary) is handled by the branches below.
    if line =~ INCIDENT_STAMP_RE && line =~ INCIDENT_DATETIME_RE
      recover_identifier_from_summary(incidents.last) unless incidents.empty?
      incidents << build_incident_from_header(line)
      in_summary = false
      awaiting_disp = false
      next
    end

    incident = incidents.last
    if incident.nil?
      # Nothing to attach this line to yet (text before the first incident).
      puts "discarding line: {" + line + "}"
      next
    end

    # The order of these checks matters: identifier, location and cc lines
    # win over a pending "Disp:" value and over summary continuation.
    if line =~ /^[0-9]+$/
      incident.identifier = line.to_i
    elsif line =~ /Location:/
      incident.location = line.sub(/Location:/, "").strip
    elsif line =~ /cc:/
      incident.cc = line.sub(/cc:/, "").strip
      in_summary = false
    elsif awaiting_disp
      incident.disp = line.sub(/Disp:/, "").strip
      awaiting_disp = false
    elsif line =~ /Summary:/ || in_summary
      fragment = line.sub(/Summary:/, "").strip
      if incident.summary.nil?
        incident.summary = fragment
      else
        incident.summary = incident.summary + " " + fragment
      end

      if incident.summary =~ /Disp:/
        # Move the "Disp:" header and whatever follows it out of the summary.
        disp_text = incident.summary[/\s*Disp:.*/]
        incident.summary = incident.summary.sub(/\s*Disp:.*/, "")
        incident.disp = disp_text.sub(/Disp:/, "").strip

        # An empty value means the disposition is on the next line.
        awaiting_disp = (incident.disp == "")
        in_summary = false
      else
        in_summary = true
      end
    else
      puts "discarding line: {" + line + "}"
    end
  end

  # The last incident has no following header to trigger the fallback.
  recover_identifier_from_summary(incidents.last) unless incidents.empty?
  incidents
end

# Builds a new Incident from a header line: category, subcategory, time, stamp.
def build_incident_from_header(line)
  incident = Incident.new

  # The category is everything before the first Capitalised word; the
  # subcategory runs from there up to the date. Start/length slices are used
  # so that an empty category yields "" rather than the whole line.
  subcategory_start = line.index(/[A-Z][a-z]/)
  datetime_start = line.index(INCIDENT_DATETIME_RE)

  incident.category = line[0, subcategory_start].strip
  incident.subcategory = line[subcategory_start, datetime_start - subcategory_start].strip
  incident.time = DateTime.parse(line[INCIDENT_DATETIME_RE])
  incident.stamp = line[INCIDENT_STAMP_RE]
  incident
end

# When +incident+ has no identifier, looks for a "DR#<digits>" reference in
# its summary and uses those digits as the identifier.
def recover_identifier_from_summary(incident)
  return unless incident.identifier.nil?

  puts "+++ Last identifier is empty, searching for identifier in summary..."
  reference = incident.summary.to_s[/DR\#[\d]+/] # to_s: the summary may be missing too
  return if reference.nil?

  digits = reference[3..-1] # drop the leading "DR#"
  puts "+++ Found! {" + digits + "}"
  incident.identifier = digits.to_i # Integer, like identifiers read from their own line
end

# Prints a one-line listing for each incident and saves it. An incident whose
# identifier, category, subcategory or location is missing fails while the
# listing is formatted; the error is printed and that incident is not saved.
def save_incidents(incidents)
  incidents.each do |incident|
    begin
      puts(("%8d" % incident.identifier) + " " +
           ("%25s" % ("{" + incident.category    + "}")) + " " +
           ("%45s" % ("{" + incident.subcategory + "}")) + " " +
           ("%60s" % ("{" + incident.location    + "}")))
      incident.save
    rescue StandardError => e
      # StandardError only: interrupts and exit requests must still propagate.
      puts e
    end
  end
end

# Script entry point: import every URL given on the command line; with no
# arguments, import the report whose file name is yesterday's date (MMDDYY.pdf).
if ARGV.empty?
  report_date = Date.today - 1
  report_name = report_date.strftime("%m%d%y") + ".pdf"
  import_url("http://capsnet.usc.edu/DPS/webpdf/" + report_name)
else
  ARGV.each { |arg| import_url(arg) }
end