#! /usr/bin/env bash
#
# Fetch git projects from a list and run our parser on the files for
# a language of interest.
#
# Strict mode: stop on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Directory and base name of this script, as it was invoked.
progdir="$(dirname "$0")"
progname="$(basename "$0")"

# Print the command-line help on stdout.
# Globals: progname (read) - name shown in the synopsis and the examples.
usage() {
  printf '%s\n' \
    "Usage: ${progname} [--upload] [--semgrep-core CMD_NAME] LANG" \
    '' \
    'Expects:' \
    '- lang/LANG/projects.txt: contains one git URL per line' \
    '- semgrep-core command must be available unless an alternative is specified' \
    "  with '--semgrep-core'." \
    '' \
    'Produces lang/LANG/stats.json.' \
    '' \
    'Examples:' \
    "  ${progname} java" \
    "  ${progname} apex --semgrep-core semgrep-core-proprietary"
}

# Report a fatal problem on stderr and terminate with exit status 1.
# Arguments: the message; several arguments are joined into one line.
error() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Defaults for the settings read by the rest of the script.
projects_file=projects.txt  # file name, looked up inside lang/LANG/
lang=""
with_upload=false
semgrep_core="semgrep-core"

# Parse the command line into the globals above.
# Globals:   lang, with_upload, semgrep_core (written)
# Arguments: the script's command-line arguments
# Exits:     0 after printing the usage for '--help'; 1 (through 'error')
#            on an unknown option, a missing option value, or more than
#            one positional argument.
parse_args() {
  while [[ $# != 0 ]]; do
    case "$1" in
      --help)
        usage
        exit 0
        ;;
      --semgrep-core)
        # Check explicitly: under 'set -u', reading a missing $2 would
        # abort with a bare "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          error "Option '$1' requires an argument"
        fi
        semgrep_core=$2
        shift
        ;;
      --upload)
        with_upload=true
        ;;
      -*)
        error "Unsupported option: '$1'"
        ;;
      *)
        if [[ -n "$lang" ]]; then
          error "Don't know what to do with extra non-optional argument: '$1'"
        else
          lang=$1
        fi
        ;;
    esac
    shift
  done
}

parse_args "$@"

# The language is the one mandatory argument.
[[ -n "$lang" ]] || error "Missing language argument"

# Make sure a shallow clone of the git repository at $url exists under
# ./tmp, and append its directory name to $project_list.
#
# Globals:
#   url           (read)    git URL of the project
#   project_list  (written) gets " ORG-PROJECT" appended (space-separated)
# Outputs: progress messages on stdout, diagnostics on stderr.
# Returns: non-zero if cloning fails, or if an existing clone's 'origin'
#          remote differs from $url.
fetch_project_files() {
  local project parent org name origin_url

  # Note that project names are not unique, so the local directory name
  # combines the last two path components of the URL (org and project).
  project=$(basename "${url%.git}")
  parent=$(dirname "${url%.git}")
  org=$(basename "$parent")
  name="$org-$project"
  project_list+=" $name"

  mkdir -p tmp
  (
    cd tmp
    if [[ ! -d "$name" ]]; then
      echo "Cloning '$name' from '$url'."
      # Since we do not care about revision history or
      # Git Large File Storage files, we can shallow clone
      # and ignore LFS pointers to expedite cloning.
      GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 -- "$url" "$name"
    else
      echo "Using local git repo for '$name'."
      # An existing directory is only reused if it was cloned from the
      # very same URL.
      origin_url=$(git -C "$name" remote get-url origin)
      if [[ "$url" != "$origin_url" ]]; then
        cat >&2 <<EOF
Wrong remote URL found in cloned repository '$name':
  found $origin_url
  expected $url
Check that you don't have two project URLs with the same repo name.
EOF
        exit 1
      fi
    fi
  )
}

# Clone every project listed for $lang, run the semgrep-core parsing
# stats over the clones, and print the global summary.
#
# Globals (read): semgrep_core, lang, projects_file, with_upload
# Files (all paths are relative to the current directory):
#   lang/$lang/$projects_file  input; blank lines and lines whose first
#                              non-space character is '#' are skipped
#   lang/$lang/tmp/            working directory holding the clones
#   lang/$lang/stats.json      stdout of semgrep-core
#   lang/$lang/stats.log       stderr of semgrep-core
# Exits with status 1 (through 'error') if semgrep-core cannot be located
# or if the projects file yields no URL.
main() {
  local sc url url_list project_list grep_status

  # 'command -v' is the shell's own lookup, so this does not depend on
  # the external 'which' utility being installed.
  if sc=$(command -v "$semgrep_core"); then
    echo "semgrep-core is: $sc"
  else
    error "failed to locate $semgrep_core"
  fi
  (
    cd lang/"$lang"

    # grep exits with 1 when it selects no line; catch that here so that
    # 'set -e' does not end the script without any message.
    grep_status=0
    url_list=$(grep -Ev '^ *(#| *$)' "$projects_file") || grep_status=$?
    if (( grep_status > 1 )); then
      error "failed to read lang/$lang/$projects_file"
    fi
    if [[ -z "$url_list" ]]; then
      error "no project URLs found in lang/$lang/$projects_file"
    fi

    # Fetch each git project. Word splitting of $url_list is intended:
    # every whitespace-separated token is one URL.
    project_list=""
    for url in $url_list; do
      fetch_project_files
    done

    (
      cd tmp
      # old: Set memory limit to avoid killing other processes that run in
      # parallel if this semgrep-core instance uses too much memory.
      ### ulimit -v 4000000  # KiB
      # TODO: we now implement the limit in Test_parsing.parsing_common
      # because the use of 'ulimit -v' above triggers some unrecoverable
      # 'Fatal error: out of memory' that even Memory_limit can't intercept.
      # See the long comment in Test_parsing.parsing_common about
      # mem_limit_mb.
      #
      # $project_list is a space-separated list of directory names and is
      # split into one argument per project on purpose.
      "$semgrep_core" -lang "$lang" -parsing_stats -json $project_list
    ) > stats.json 2> stats.log
  )

  if [[ "$with_upload" = true ]]; then
    ./upload-parsing-rates lang/"$lang"/stats.json
  fi
  echo "stats available in lang/$lang/stats.json"
  jq .global "lang/$lang/stats.json"
}

# Entry point: run the stats collection using the settings held in the
# globals assigned above (lang, semgrep_core, with_upload, projects_file).
main
