#!/usr/bin/env bash
# =============================================================================
# onx-access-log-parse — Apache/Nginx access log parser
#
# Input (stdin JSON):
#   {
#     "username": "onx_xxxx",
#     "domain":   "example.com",
#     "period":   "today" | "24h" | "7d" | "30d",
#     "top_count": 10
#   }
#
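# Behaviour:
#   1. Tries to find the access log in this order (first readable file wins):
#        /home/$USERNAME/logs/$DOMAIN-access.log
#        /var/log/httpd/$DOMAIN-access.log
#        /var/log/nginx/$DOMAIN-access.log
#        /home/$USERNAME/logs/${DOMAIN}_access_log
#      If none is readable, an empty report is printed and the script exits 0.
#   2. Tails the last N lines (combined log format). N only approximates the
#      period: today/24h = 100k, 7d = 500k, 30d = 1.5M lines; unknown periods
#      fall back to 100k. No timestamp filtering is performed.
#   3. Always computes the core statistics with awk (visits, unique IPs,
#      top pages/referrers/user agents, hourly distribution).
#   4. If `goaccess` is installed, it is additionally run with
#      --output-format=json to try to fill in "top_countries".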
#
# Output (stdout JSON):
#   {
#     "domain": "...", "log_file": "...", "lines_scanned": 87421,
#     "total_visits": 12453, "unique_visitors": 4321,
#     "hourly": [{"hour": "00:00", "visits": 120}, ...],
#     "top_pages": [{"path": "/", "hits": 4521}, ...],
#     "top_referrers": [{"referrer": "google.com", "hits": 215}, ...],
#     "top_user_agents": [{"agent": "Chrome", "hits": 5210}, ...],
#     "top_countries": [],
#     "engine": "awk" | "goaccess+awk" | "none"
#   }
#   When no access log is found, "log_file" is null, "engine" is "none" and an
#   extra "note" field explains why the report is empty.
#
# Exit codes: 0=ok 1=invalid-input 2=preflight-fail 3=execution-fail
# Deployed to: /usr/local/onoxsoft/bin/onx-access-log-parse
# =============================================================================

set -euo pipefail

# Resolve the real location of this script (following symlinks) so the shared
# helper library is found relative to it, however the script was invoked.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "${SCRIPT_DIR}/_lib/common.sh"

require_cmd jq
require_cmd awk

# The whole request arrives as one JSON document on stdin.
INPUT=$(cat)
onx_require_json "${INPUT}"

# The optional third argument of onx_json_get is presumably the default used
# when the key is absent — confirm against _lib/common.sh.
USERNAME=$(onx_json_get "${INPUT}" "username")
DOMAIN=$(onx_json_get   "${INPUT}" "domain")
PERIOD=$(onx_json_get   "${INPUT}" "period" "24h")
TOP_COUNT=$(onx_json_get "${INPUT}" "top_count" "10")

onx_validate_username "${USERNAME}"
onx_validate_domain   "${DOMAIN}"

# TOP_COUNT is used further down as a numeric row limit. It comes straight from
# the caller, so refuse anything that is not a plain non-negative integer
# (negative numbers, floats, text) with the documented invalid-input exit code
# instead of letting it produce malformed or unbounded output later.
if [[ ! "${TOP_COUNT}" =~ ^[0-9]+$ ]]; then
    printf 'onx-access-log-parse: top_count must be a non-negative integer\n' >&2
    exit 1
fi

# ── Locate log file ──────────────────────────────────────────────────────────
# Candidate locations in priority order; the first readable one is used.
LOG_FILE=""
log_candidates=(
    "/home/${USERNAME}/logs/${DOMAIN}-access.log"
    "/var/log/httpd/${DOMAIN}-access.log"
    "/var/log/nginx/${DOMAIN}-access.log"
    "/home/${USERNAME}/logs/${DOMAIN}_access_log"
)
for candidate in "${log_candidates[@]}"; do
    [[ -r "${candidate}" ]] || continue
    LOG_FILE="${candidate}"
    break
done

# A missing log is not treated as a failure: answer with an all-empty report
# (engine "none") and exit 0 so the caller can still render something.
[[ -n "${LOG_FILE}" ]] || {
    printf '{"domain":"%s","log_file":null,"lines_scanned":0,"total_visits":0,"unique_visitors":0,"hourly":[],"top_pages":[],"top_referrers":[],"top_user_agents":[],"top_countries":[],"engine":"none","note":"access log not found"}\n' "${DOMAIN}"
    exit 0
}

# ── Period → cutoff line count (approx) ──────────────────────────────────────
# The period is not applied as a timestamp filter; it only selects how many
# trailing log lines get scanned. Unknown periods use the smallest budget.
#
# The budget deliberately does NOT live in a variable named LINES: bash owns
# LINES/COLUMNS and, with checkwinsize active, can overwrite them with the
# terminal size after an external command finishes. When the script is run
# from a terminal that would silently shrink the scan to one screenful.
case "${PERIOD}" in
    today|24h) TAIL_LINES=100000 ;;
    7d)        TAIL_LINES=500000 ;;
    30d)       TAIL_LINES=1500000 ;;
    *)         TAIL_LINES=100000 ;;
esac

# Scratch copy of the log tail; removed again on every exit path.
TMPSCAN="$(mktemp -t onx-alog.XXXXXX)"
trap 'rm -f "${TMPSCAN}" 2>/dev/null || true' EXIT

# Best effort: if tail fails the scratch file stays empty and the report simply
# shows zero lines scanned.
tail -n "${TAIL_LINES}" "${LOG_FILE}" > "${TMPSCAN}" 2>/dev/null || true
SCANNED=$(wc -l < "${TMPSCAN}" 2>/dev/null || echo 0)

ENGINE="awk"

# ── Pure-awk parse (always run; goaccess is opt-in widget enrichment) ────────

# Print how many distinct client addresses (first whitespace-separated field)
# appear in the log file given as $1. Blank lines are ignored. Returns non-zero
# without printing anything useful if the file cannot be read.
_alog_unique_visitors() {
    awk 'NF > 0 && !seen[$1]++ { n++ } END { print n + 0 }' 2>/dev/null < "$1"
}

# Every scanned line counts as one visit.
TOTAL_VISITS=$(wc -l < "${TMPSCAN}" 2>/dev/null || echo 0)

# The fallback is assigned rather than echoed inside the substitution: an echo
# there would be appended to whatever the failed pipeline had already printed
# and could leave two numbers in the variable.
UNIQUE_VISITORS=$(_alog_unique_visitors "${TMPSCAN}") || UNIQUE_VISITORS=0

# Hourly distribution
# Splitting each line on "[" and "]" leaves the bracketed timestamp in field 2,
# e.g. 13/May/2026:08:42:12 +0000; the hour is the text between its first and
# second colon. All 24 buckets are always emitted so quiet hours show as zero.
hourly_items=$(awk -F'[][]' '
    $2 != "" {
        split($2, parts, ":")
        per_hour[parts[2]]++
    }
    END {
        sep = ""
        for (i = 0; i < 24; i++) {
            label = sprintf("%02d", i)
            printf "%s{\"hour\":\"%s:00\",\"visits\":%d}", sep, label, per_hour[label] + 0
            sep = ","
        }
    }
' "${TMPSCAN}" 2>/dev/null || true)
HOURLY_JSON="[${hourly_items}]"

# ── Top-N lists: pages, referrers, user agents ───────────────────────────────
# Each list is built the same way: extract one value per request, count the
# occurrences, keep the N most frequent, print them as a JSON array.
#
# Two deliberate design points (the script runs under `set -o pipefail`):
#   * No grep / head in the pipelines. `grep -v` exits 1 when nothing is left
#     to print and `head` can kill the preceding sort with SIGPIPE; either
#     marks the whole pipeline as failed although its output is complete.
#   * The '[]' fallback is assigned after the substitution instead of being
#     echoed inside it, so it replaces the output rather than being appended
#     to an array that was already printed.

# Read one value per line on stdin and print the most frequent ones as a JSON
# array of {"<key>":"<value>","hits":<count>} objects, most frequent first.
#   $1  JSON key to use for the value
#   $2  maximum number of entries (anything non-numeric counts as 0)
# Values are expected to be single tokens; backslashes and double quotes in
# them are JSON-escaped.
_alog_top_json() {
    local key="$1" limit="$2"
    LC_ALL=C sort | LC_ALL=C uniq -c | LC_ALL=C sort -rn \
        | awk -v key="${key}" -v limit="${limit}" '
            BEGIN { printf "[" }
            NR <= limit + 0 {
                value = $2
                gsub(/[\\"]/, "\\\\&", value)
                printf "%s{\"%s\":\"%s\",\"hits\":%d}", (NR > 1 ? "," : ""), key, value, $1
            }
            END { printf "]" }
        '
}

# Top requested paths of log file $1 (field 7 of the combined format),
# limited to $2 entries. Lines without a usable path are skipped.
_alog_top_pages() {
    awk '$7 != "" && $7 != "-" { print $7 }' 2>/dev/null < "$1" \
        | _alog_top_json path "$2"
}

# Top referring hosts of log file $1, limited to $2 entries. The referrer is
# the second quoted string of a line; only the host part of its URL is kept.
# Empty or "-" referrers and referrers without a host part are skipped.
_alog_top_referrers() {
    awk -F'"' '
        $4 != "" && $4 != "-" {
            split($4, url, "/")
            if (url[3] != "") print url[3]
        }
    ' 2>/dev/null < "$1" \
        | _alog_top_json referrer "$2"
}

# Top browser families of log file $1, limited to $2 entries. The user agent
# is the third quoted string of a line. The order of the checks matters:
# Edge and Opera agents also contain "Chrome/", and Chrome agents also contain
# "Safari/", so the more specific markers are tested first.
_alog_top_agents() {
    awk -F'"' '
        {
            ua = $6
            if      (ua ~ /Edg\//)                 family = "Edge"
            else if (ua ~ /OPR\//)                 family = "Opera"
            else if (ua ~ /Chrome\//)              family = "Chrome"
            else if (ua ~ /Firefox\//)             family = "Firefox"
            else if (ua ~ /Safari\//)              family = "Safari"
            else if (ua ~ /bot|Bot|spider|Spider/) family = "Bot"
            else                                   family = "Other"
            print family
        }
    ' 2>/dev/null < "$1" \
        | _alog_top_json agent "$2"
}

TOP_PAGES_JSON=$(_alog_top_pages "${TMPSCAN}" "${TOP_COUNT}") || TOP_PAGES_JSON='[]'
TOP_REF_JSON=$(_alog_top_referrers "${TMPSCAN}" "${TOP_COUNT}") || TOP_REF_JSON='[]'
TOP_UA_JSON=$(_alog_top_agents "${TMPSCAN}" "${TOP_COUNT}") || TOP_UA_JSON='[]'

# ── Optional GoAccess enrichment (top countries) ─────────────────────────────
# When goaccess is installed it is run over the same scratch file and its JSON
# report is mined for per-country hit counts. Any failure along the way leaves
# top_countries as an empty array.
TOP_COUNTRIES_JSON='[]'
if command -v goaccess >/dev/null 2>&1; then
    ENGINE="goaccess+awk"

    # The fallback is assigned, not echoed inside the substitution, so a
    # failing goaccess cannot leave partial output followed by '{}'.
    GA_JSON=$(goaccess "${TMPSCAN}" --no-global-config --output-format=json \
        --log-format=COMBINED 2>/dev/null) || GA_JSON='{}'

    # Country data is taken from the "geolocation" panel; the "visitors" panel
    # holds per-day visitor counts, not countries. Entries that carry nested
    # "items" are flattened to those items.
    # NOTE(review): panel layout assumed from the GoAccess JSON report format —
    # confirm against the deployed GoAccess version; without GeoIP data the
    # panel is presumably absent and the result stays [].
    # Parentheses around the `//` alternatives keep the filter valid inside the
    # object constructor.
    COUNTRIES_RAW=$(printf '%s\n' "${GA_JSON}" | jq -c '
        [ (.geolocation.data // [])[]
          | (.items // [.])[]
          | { code: .data, hits: (.hits.count // 0) } ]
    ' 2>/dev/null) || COUNTRIES_RAW='[]'

    # Only accept something that at least looks like a JSON array.
    if [[ "${COUNTRIES_RAW}" == "["* ]]; then
        TOP_COUNTRIES_JSON="${COUNTRIES_RAW}"
    fi
fi

onx_log "access-log-parse: ${DOMAIN} lines=${SCANNED} visits=${TOTAL_VISITS} engine=${ENGINE}"

# Emit the final report as a single line of JSON on stdout. The list values
# are already JSON fragments and are inserted verbatim; domain, log file and
# engine are inserted as plain strings between quotes.
printf '{"domain":"%s","log_file":"%s","lines_scanned":%s,"total_visits":%s,"unique_visitors":%s,"hourly":%s,"top_pages":%s,"top_referrers":%s,"top_user_agents":%s,"top_countries":%s,"engine":"%s"}\n' \
    "${DOMAIN}" \
    "${LOG_FILE}" \
    "${SCANNED}" \
    "${TOTAL_VISITS}" \
    "${UNIQUE_VISITORS}" \
    "${HOURLY_JSON}" \
    "${TOP_PAGES_JSON}" \
    "${TOP_REF_JSON}" \
    "${TOP_UA_JSON}" \
    "${TOP_COUNTRIES_JSON}" \
    "${ENGINE}"
