#!/bin/bash
#
# Download the PP data into ./data under the current working directory, given the video table file.
# Requires curl and GNU parallel (for the `sem` command).
# Make sure you have enough disk space: ~1.5-2 TB is ideal if you want to download everything.
# You can interrupt this script and rerun it at a later stage; already-existing data will not be
# re-downloaded.
# 
# 


# --- Configuration ---------------------------------------------------------
# Every setting below can be overridden from the environment without editing
# this file, e.g.:
#   INPUT_FILE=videos_anon_filtered_export_2023-10-03.tsv SUBSET_FACTOR=100 bash <this script>

# Video table to download from.
#   - videos_anon_filtered_export_2023-10-03.tsv
#       everything, even videos without annotation (10x more data)
#   - videos_anon_filtered_export_only_annotated_2023-10-03.tsv  (default)
#       only annotated data (the species or "no bird")
INPUT_FILE="${INPUT_FILE:-videos_anon_filtered_export_only_annotated_2023-10-03.tsv}"

# Subset the dataset if needed: only every SUBSET_FACTOR-th row of the table
# is downloaded (1 = every row). Good if you want a test dataset quickly.
SUBSET_FACTOR="${SUBSET_FACTOR:-1}"

# Max number of simultaneous downloads. Please be nice on our server, don't
# put this too high (8 or fewer should be good).
MAX_CONNECTIONS="${MAX_CONNECTIONS:-8}"


# Fail early, with a clear message, when the video table is missing.
if [[ ! -r "$INPUT_FILE" ]]; then
    printf 'Error: cannot read input file "%s"\n' "$INPUT_FILE" >&2
    exit 1
fi

# SUBSET_FACTOR is used as a modulus in the download loop, so it must be a
# positive integer (0 would be a division by zero; a leading 0 would be
# parsed as octal).
if [[ ! "$SUBSET_FACTOR" =~ ^[1-9][0-9]*$ ]]; then
    printf 'Error: SUBSET_FACTOR must be a positive integer, got "%s"\n' "$SUBSET_FACTOR" >&2
    exit 1
fi

# Row counter: row 0 is the header line of the table, data rows start at 1.
i=0

# Number of data rows (line count minus the header). The file is fed to wc
# through stdin so that it prints only the count; some wc implementations pad
# that count with spaces, which arithmetic expansion tolerates.
total="$(( $(wc -l < "$INPUT_FILE") - 1 ))"

# Base URL under which the generated videos are published.
DLROOT="https://archive.poids-plume.fr/files/generated"

# Walk the video table line by line and queue one download per selected row.
#   - IFS= keeps leading tabs, so an empty first column does not shift fields.
#   - The "|| [[ -n ... ]]" part also processes a last line lacking a newline.
while IFS= read -r line || [[ -n "$line" ]]; do
    row=$i
    i=$(( i + 1 ))

    # Row 0 is the header; of the remaining rows keep every SUBSET_FACTOR-th.
    if (( row == 0 || row % SUBSET_FACTOR != 0 )); then
        continue
    fi

    # Tab-separated columns: season, date, path name, local path, feeder.
    season="$(cut -f 1 <<<"$line")"
    date="$(cut -f 2 <<<"$line")"
    path_name="$(cut -f 3 <<<"$line")"
    local_path="$(cut -f 4 <<<"$line")"
    feeder="$(cut -f 5 <<<"$line")"

    # Remove "" when they are present
    dlpath="$DLROOT/$season/videos/$feeder/$date/$path_name/video.mp4"
    dlpath="${dlpath//\"/}"
    local_path="${local_path//\"/}"

    # We dl an mp4 file, not a raw h264 stream
    if [[ "$local_path" == *h264 ]]; then
        local_path="${local_path%h264}mp4"
    fi
    local_path="./data/$local_path"

    if [[ ! -e "$local_path" ]]; then
        echo "[$row/$total] [DL] $dlpath"
        mkdir -p "$(dirname "$local_path")"

        # Download into a ".part" file and rename it only once curl succeeded.
        # The final path therefore exists only for complete downloads, so an
        # interrupted or failed transfer is retried on the next run instead of
        # being skipped as "already downloaded". --fail makes curl return an
        # error on HTTP failures instead of saving the error page as a video.
        part_path="$local_path.part"

        # sem hands the command to a shell, so the whole job is passed as one
        # string with every path/URL shell-quoted through %q.
        printf -v job \
            'if curl --fail --no-progress-meter --output %q %q; then mv -- %q %q; else rm -f -- %q; echo "[FAIL]" %q >&2; fi' \
            "$part_path" "$dlpath" "$part_path" "$local_path" "$part_path" "$dlpath"

        # sem blocks while MAX_CONNECTIONS jobs are already running, then
        # starts the job in the background. stdin is detached so the job can
        # never consume lines of the table that feeds this loop.
        sem -j "$MAX_CONNECTIONS" "$job" </dev/null
    else
        echo "[$row/$total] [--] $dlpath"
    fi
done < "$INPUT_FILE"

# Do not return before every queued background download has finished.
sem --wait
