#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests, os

# os.getenv returns a string when the variable is set, so a bare truthiness
# check would treat DEBUG=0 as enabled; default to debugging on
DEBUG = os.getenv("DEBUG", "1").lower() not in ("0", "false", "no", "")
if DEBUG:
    from datetime import datetime

# TODO: pull REPORTS and REMOTE_BASE from the environment as well? (probably unnecessary)

# the long URL is split across lines via implicit string concatenation
REPORTS = ("https://www.police.ucsd.edu/docs/reports/CallsandArrests/"
           "Calls_and_Arrests.asp")

# the actual location of logs sometimes differs from the report page
REMOTE_BASE = "https://www.police.ucsd.edu/docs/reports/CallsandArrests/"

# get the local directory from the environment; otherwise, default to
# /srv/files/reports/
LOCAL_BASE = os.getenv("LOCAL_DIR", "/srv/files/reports/")
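# a typical invocation then looks something like
#   LOCAL_DIR=/tmp/reports DEBUG=0 python3 download_reports.py
# (the directory and script name above are only placeholders)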

# processed paths to skip, typically READMEs or similar files; the empty string
# is here because ucsd_process returns "" for anything it doesn't recognize
SKIP = [""]

MONTH_DICT = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12
}

# we don't necessarily want our filesystem to directly match the remote one and
# we don't necessarily want to pull all files, so we preprocess filenames here
# and then do additional filtering with SKIP; this is an inelegant but compact
# workaround that lets us, for example, return "SF" for all system files that
# match a regex, then simply deny by adding "SF" to the skip list
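
# a sketch of that "SF" idea (not used anywhere below; the regex is only an
# assumed example of what counts as a system file):
def example_process_system_files(path):
    import re
    # collapse anything that looks like a system file to the sentinel "SF",
    # which can then be denied by adding "SF" to SKIP
    if re.search(r"(?i)readme|thumbs\.db|\.ds_store", path):
        return "SF"
    return path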

# the default process_path in download_reports returns the path unchanged; this
# is the UCSD-specific implementation, which rewrites calls-for-service
# filenames and rejects everything else
def ucsd_process(path):
    if path.startswith("CallsForService/"):
        # strip the "CallsForService/" prefix and replace spaces
        stripped = path[len("CallsForService/"):].replace(" ", "_")

        # get raw filename
        # two decompositions: get full filename, then remove extension
        components = stripped.split("/")
        fname = components[-1].split(".")
        date_parts = fname[0].split("_")  # expected to look like ["January", "15,", "2024"]

        # replace the English-formatted date with an ISO-8601-style
        # year_month_day date so the files sort chronologically; there's
        # probably a library for this (a datetime-based sketch follows
        # this function)

        # do some verification: sometimes the filenames are wonky
        if (len(date_parts) == 3 and date_parts[0] in MONTH_DICT
                and date_parts[1].rstrip(",").isdigit()):
            month = MONTH_DICT[date_parts[0]]
            day = int(date_parts[1].rstrip(","))
            fname[0] = f"{date_parts[2]}_{month:02d}_{day:02d}"

        # recompose filename
        components[-1] = '.'.join(fname)
        return '/'.join(components)
    return "" # TODO: consider just returning path so we don't miss files?

# parsing an arbitrary report page for remote file locations can't be automated
# generically, so we require the user to write their own filter function. in
# the future, we plan to move all parsing logic entirely to this processing
# function so that users can parse with regex or whatever else suits their
# needs.

# by default (when no filter is given) we simply pull the hrefs from all links
# as a basic attempt at retrieving report information; the UCSD page lists its
# reports as <option> elements instead, so we pull their value attributes
def ucsd_filter(parsed):
    return [match['value'] for match in parsed.body.find_all('option')]
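
# as an illustration of the "regex or whatever else" point above, a filter
# could just as easily work on raw links; the href pattern here is an assumed
# example and this function is not used below
def example_href_filter(parsed):
    import re
    # keep only anchors whose href looks like a PDF
    return [a['href'] for a in parsed.body.find_all('a', href=re.compile(r"\.pdf$", re.I))]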

# parse the remote repository for remote report locations and then download them
def download_reports(reports, remote, local, skip=(), filter_reports=None,
                     process_path=lambda x: x, no_replace=False):
    if DEBUG:
        print(f"[{datetime.now()}] Beginning download using parameters:")
        print(f"REPORTS URL: {reports}")
        print(f"REMOTE URL: {remote}")
        print(f"LOCAL DIRECTORY: {local}")
        print(f"SKIP LIST: {skip}")
        if filter_reports: # could be in the fstring itself but extremely ugly
            print(f"FILTER FUNCTION: {filter_reports.__name__}")
        else:
            print(f"FILTER FUNCTION: default")
        print(f"PATH PROCESSING FUNCTION: {process_path.__name__}")
        print(f"REPLACEMENT DISABLED: {no_replace}")
    resp = requests.get(reports, timeout=30)
    parsed = BeautifulSoup(resp.content, 'lxml')

    if not filter_reports:
        # default behavior: grab the href of every link on the page
        matches = [match['href'] for match in parsed.body.find_all('a', href=True)]
    else:
        matches = filter_reports(parsed)

    for match in matches:
        processed = process_path(match)
        if processed in skip:
            if DEBUG:
                print(f"[{datetime.now()}] Skipped '{match}'!")
            continue
        download_report(remote + match, local + processed,
                        no_replace=no_replace)
        
# download a report to the given local path. if no_replace is enabled, we skip
# files that already exist on disk
def download_report(report, local, no_replace=False):
    # ensure local exists
    os.makedirs(os.path.dirname(local), exist_ok=True)
    
    if no_replace:
        # we are not worried about a race between this check and the write
        # below: we never read the existing contents, and the worst case is an
        # unnecessary re-download that overwrites the file
        if os.path.exists(local):
            if DEBUG:
                print((f"[{datetime.now()}] {local} already exists on disk! "
                       "Skipping, since no_replace is enabled."))
            return

    resp = requests.get(report, timeout=30)
    if DEBUG:
        print(f"[{datetime.now()}] Retrieved {report}!")
    with open(local, "wb") as fout:
        fout.write(resp.content)
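
# if the exists()/write window in download_report ever becomes a concern, a
# stricter variant could rely on exclusive creation instead; a minimal sketch
# (not used above), assuming the same report/local arguments:
def download_report_exclusive(report, local):
    os.makedirs(os.path.dirname(local), exist_ok=True)
    try:
        # mode "xb" raises FileExistsError if the file already exists
        with open(local, "xb") as fout:
            fout.write(requests.get(report, timeout=30).content)
    except FileExistsError:
        pass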

if __name__ == "__main__":
    download_reports(REPORTS, REMOTE_BASE, LOCAL_BASE, skip=SKIP,
                     filter_reports=ucsd_filter, process_path=ucsd_process,
                     no_replace=True)
