#!/toc/home/edemaine/bin/python
import glob, operator, os, re, sys

## Directory that receives the numbered copies ("0000.ps", "0001.ps", ...).
target_dir = "/scratch/edemaine/all_papers"
## Glob pattern selecting the PostScript file of every paper to process.
src_spec = "/toc/home/edemaine/public_html/papers/*/paper.ps"
## A paper is skipped when the first component of its reference matches
## this pattern (the search is done case-insensitively).
ignore_re = r"technical report|manuscript|thesis"
def index_file (paperps):
  """Map a paper's PostScript path to the path of its index page.

  Every occurrence of "paper.ps" in paperps is replaced by "index.html";
  a path without that substring is returned unchanged.
  """
  ps_name = "paper.ps"
  html_name = "index.html"
  html_path = paperps.replace (ps_name, html_name)
  return html_path

## Make sure the destination exists.  A bare os.mkdir fails when the
## directory is already there, so only create it when it is missing.
if not os.path.isdir (target_dir):
  os.makedirs (target_dir)
## glob returns its matches in arbitrary order; sort them so that the
## sequence numbers assigned to the copies are the same on every run.
src_files = glob.glob (src_spec)
src_files.sort ()

def make_tag (index_file):
  """Build the list of tag strings for a paper from its index page.

  Scans the file named by index_file for a line containing
  "<b>Reference</b>" (case-insensitive) and parses the line right after
  it: the text that follows the ``quoted'' title (and an optional "in"),
  up to "</dd>", is stripped of HTML tags, accent entities and editor
  credits, then split on "." into non-empty, whitespace-trimmed pieces.
  If the file contains several reference markers, the last one wins.

  Note that the parameter name shadows the module-level index_file
  function inside this body.

  Returns the list of pieces, or None when the file has no reference
  marker or when the first piece matches ignore_re.

  Raises ValueError when the line after a marker does not have the
  expected shape, or when it yields no text at all.
  """
  index = open (index_file, "r")
  try:
    stuff = None
    while 1:
      ## readline (not iteration) because the citation line is pulled
      ## from the same file object inside the loop body.
      line = index.readline ()
      if not line: break
      if re.search (r"<b>Reference</b>", line, re.I):
        line = index.readline ()
        match = re.search (r"``.*?''\s*(?:in\s*)?(.*?)</dd>", line, re.I)
        if not match:
          raise ValueError ("failed looking at %s" % line)
        stuff = match.group (1)
        stuff = re.sub (r"</I>,", ".", stuff, 1)  ## cause line break after italic
        stuff = re.sub (r"<[^<>]*>", "", stuff)  ## remove HTML
        stuff = re.sub (r"&(\w)(acute|grave|uml);", r"\1", stuff)  ## remove accents
        stuff = re.sub (r"&amp;", r"&", stuff)
        stuff = re.sub (r"edited by.*?(Lecture Notes)", r"\1", stuff) ## remove editors
        stuff = re.sub (r"edited by.*?,", r"", stuff) ## remove editors
        pieces = [x.strip () for x in stuff.split (".")]
        stuff = [x for x in pieces if x]
        if not stuff:
          ## Without this check stuff[0] below dies with a bare IndexError.
          raise ValueError ("empty reference in %s: %s" % (index_file, line))
        if re.search (ignore_re, stuff[0], re.I):
          return None
    return stuff
  finally:
    ## Runs on every exit path, including the early return and the raises.
    index.close ()

def run (program, args):
  """Run an external program and wait for it to finish.

  program is looked up on PATH (os.execvp) and receives the items of
  args as its arguments, after program itself as argv[0].

  Returns the (pid, status) pair from os.waitpid for the child; status
  is the raw wait status, so 0 means the program exited successfully.
  If the program cannot be executed, the child reports the error on
  stderr and exits with code 127.

  Raises OSError if the fork itself fails.
  """
  ## Build argv before forking so a bad args value fails in the parent.
  argv = [program] + list (args)
  pid = os.fork ()
  if pid == 0:
    try:
      try:
        os.execvp (program, argv)
      except OSError:
        sys.stderr.write ("%s: %s\n" % (program, sys.exc_info ()[1]))
        sys.stderr.flush ()
    finally:
      ## Only reached when exec failed.  _exit keeps the forked copy from
      ## falling back into the caller's code or flushing a duplicate of
      ## the parent's buffered output.
      os._exit (127)
  ## Wait for this particular child rather than whichever finishes first.
  return os.waitpid (pid, 0)

## Copy every paper that has a usable reference into target_dir under a
## sequential four-digit name, then run tagps on the copy with one --tag
## option per component of the reference.
count = 0
for src_file in src_files:
  tag = make_tag (index_file (src_file))
  if tag is None:
    print ("*** skipping %s %s" % (src_file, tag))
    continue
  print ("%d. %s -> %s" % (count, src_file, tag))
  target = os.path.join (target_dir, "%04d.ps" % count)
  pid, status = run ("cp", [src_file, target])
  if status != 0:
    ## Nothing to tag when the copy failed; the number stays free for
    ## the next paper.
    print ("*** cp failed (status %d) for %s" % (status, src_file))
    continue
  tag_args = []
  for x in tag:
    tag_args += ["--tag", x]
  pid, status = run ("tagps", tag_args + [target])
  if status != 0:
    ## The copy exists, so keep its number, but make the failure visible.
    print ("*** tagps failed (status %d) for %s" % (status, target))
  count += 1
