#! /usr/bin/env Rscript
# Time-stamp: <2019-11-06 14:02:39 christophe@pallier.org>


require("rjson")
require("tools") # Required for md5sum

#  Download openlexicon's datasets from a json file using 'dafter' syntax (see https://github.com/vinzeebreak/dafter/)

# Remote dir containing the json files describing the datasets, use *raw* github
default_remote <- "https://raw.githubusercontent.com/chrplr/openlexicon/master/datasets-info/_json/"


# Usage:
# source('https://raw.githubusercontent.com/chrplr/openlexicon/master/datasets-info/fetch_datasets.R')
# lexique <- get_lexique382()
#  or
# uscorpus <- readRDS(fetch_dataset('SUBTLEX-US-corpus', format='rds')$datatables[[1]])




fetch_dataset <- function(dataset_id, location=default_remote, filename=NULL, format=NULL)
# download, only if needed, a dataset from openlexicon databases
# returns a list with information about the dataset and a list of local filenames containing the datatables):
## list(name=dataset_id,
##      datatables=tables,
##      description=description,
##      readme=readme,
##      website=website)
{
    destname <- ''

    json_file <- paste(location, dataset_id, '.json', sep="")

    json_data <- fromJSON(file=json_file)
    description <- json_data$description
    readme <- json_data$readme
    website <- json_data$website

    tables = list()
    for (u in json_data$urls)
    {
        fname <- basename(u$url)

        if (!is.null(filename) && (filename != fname))
            next  # skip this file

        if (!is.null(format) && tools::file_ext(fname) != format)   # check if format (extension) matches
            next  # skip this file

        destname <- file.path(get_data.home(), fname)
        warning(paste("Downloading in ", destname))

        if (!file.exists(destname))
        {
            download.file(u$url, destname, mode='wb')
            if (md5sum(destname) != u$md5sum)
            {
              warning("Something is wrong: the md5sums don't match. Either the upstream files are inconsistent or someone is messing with your internet connection.")
            } else
            {
                print(paste("File", destname, "downloaded without issue."))
                tables <- append(tables, destname)
            }
        }  else  # The local file exists
        {
            if (md5sum(destname) != u$md5sum) {
                warning(paste("the md5 sum of your local file", destname, md5sum(destname), "doesn't match the distant version", u$md5sum, ". Aborting. Delete the local file if necessary"))
            }
            else
            {
                warning(paste("You already have the file", destname, "which is up to date."))
                tables <- append(tables, destname)
            }
        }
    }
    if (length(tables) == 0)
    {
        warning("could not find a file with a matching format")
    }
    list(name=dataset_id,
         datatables=tables,
         description=description,
         readme=readme,
         website=website)
}

get_data.home <- function()
{
    data.home <- Sys.getenv('OPENLEXICON_DATASETS')
    xdg.data.home <- Sys.getenv('XDG_DATA_HOME')

    if (data.home == "")
    {
        if (xdg.data.home == "")
        {
            data.home <- file.path(path.expand('~'), 'openlexicon_datasets')
        } else {
            data.home <- file.path(xdg.data.home, 'openlexicon_datasets')
        }
    }
    dir.create(data.home, showWarnings=FALSE, recursive=TRUE)
    data.home
}

get_FreqSubImage1916 <- function()
{
    info <- fetch_dataset('FreqSub Image 1916', format='rds')
    readRDS(info$datatables[[1]])
}

get_lexique382 <- function()
{
    info <- fetch_dataset('Lexique382', format='rds')
    readRDS(info$datatables[[1]])
}

get_lexique383 <- function()
{
    info <- fetch_dataset('Lexique383', format='rds')
    readRDS(info$datatables[[1]])
}

get_lexique3 <- function()
{
    info <- fetch_dataset('Lexique383', format='rds')
    readRDS(info$datatables[[1]])
}


get_worldlex.french <- function()
{
    info <- fetch_dataset('WorldLex-French', format='rds')
    readRDS(info$datatables[[1]])
}

get_worldlex.english <- function()
{
    info <- fetch_dataset('WorldLex-English', format='rds')
    readRDS(info$datatables[[1]])
}

get_subtlex.us <- function()
{
    info <- fetch_dataset('SUBTLEX-US', format='rds')
    readRDS(info$datatables[[1]])

}

get_aoa32 <- function()
{
    info <-  fetch_dataset('AoA-32lang', format='tsv')
    read.table(info$datatables[[1]], header=TRUE, sep='\t')
}

