"""Batch-run tedana on multi-echo fMRIPrep outputs.

Walks an fMRIPrep derivatives tree for multi-echo preprocessed BOLD
images, pairs each acquisition with its echo times (read from the BIDS
``_bold.json`` sidecars), and builds the per-acquisition argument list
consumed by the tedana workers defined below.
"""
import argparse
import json
import os
import re

import pandas as pd
from tedana import workflows

# Command-line interface: where the data live and how many workers to run.
parser = argparse.ArgumentParser(
    description='Give me a path to your fmriprep output and number of cores to run')
parser.add_argument('--fmriprepDir', default=None, type=str,
                    help="This is the full path to your fmriprep dir")
parser.add_argument('--bidsDir', default=None, type=str,
                    help="This is the full path to your BIDS directory")
parser.add_argument('--cores', default=None, type=int,
                    help="This is the number of parallel jobs to run")
args = parser.parse_args()

prep_data = args.fmriprepDir
bids_dir = args.bidsDir
cores = args.cores

# Every multi-echo BOLD image anywhere under the fMRIPrep directory.
echo_images = [f for root, dirs, files in os.walk(prep_data)
               for f in files
               if '_echo-' in f and f.endswith('_bold.nii.gz')]

# One prefix per acquisition (everything before "_echo-"). Deduplicate
# with a set, then sort so job order is reproducible across runs.
image_prefix_list = sorted({re.search('(.*)_echo-', f).group(1)
                            for f in echo_images})

# One row per acquisition: [sub, echo files, echo times, out dir, run prefix].
data = []
for acq in image_prefix_list:
    # Subject/session identifier, e.g. "sub-01_ses-1".
    sub = re.search('sub-[^_]+_ses-[^_]+', acq).group(0)
    run_prefix = acq

    # BIDS JSON sidecars for this acquisition hold the echo times.
    ME_headerinfo = [os.path.join(root, f)
                     for root, dirs, files in os.walk(bids_dir)
                     for f in files
                     if acq in f and f.endswith('_bold.json')]

    # Read EchoTime from each sidecar; `with` closes each handle promptly
    # (the old `json.load(open(f))` form leaked file descriptors).
    echo_times = []
    for header_path in ME_headerinfo:
        with open(header_path) as header_file:
            echo_times.append(float(json.load(header_file)['EchoTime']))

    # Current versions of tedana expect TE in milliseconds; BIDS sidecars
    # store seconds.
    echo_times = [1000 * x for x in echo_times]
    echo_times.sort()

    # Sanity check: a sub-millisecond TE almost certainly means the
    # sidecar value was not in seconds as assumed above.
    if any(x < 1 for x in echo_times):
        print("Warning: Echo Times Maybe incorrectly indicated. \n The current script assumes values being read from _bold.json files contain echo times in seconds")

    # Preprocessed per-echo images for this acquisition, sorted so the
    # file order matches the sorted echo times.
    acq_image_files = [os.path.join(root, f)
                       for root, dirs, files in os.walk(prep_data)
                       for f in files
                       if acq in f and 'echo' in f
                       and f.endswith('_desc-preproc_bold.nii.gz')]
    acq_image_files.sort()

    # Output directory sits next to the fMRIPrep dir, one folder per
    # subject/session. tedana creates it when the workflow runs.
    out_dir = os.path.join(
        os.path.abspath(os.path.dirname(prep_data)),
        "tedana_AIC_new/%s" % (sub))
    print(run_prefix, prep_data, out_dir)

    data.append([sub, acq_image_files, echo_times, out_dir, run_prefix])

# Collect everything into a DataFrame, then into the tuple stream that
# Pool.starmap unpacks into RUN_Tedana's positional arguments.
# NOTE: this deliberately rebinds `args` (the argparse namespace was
# already unpacked above).
InData_df = pd.DataFrame(
    data=data,
    columns=['sub', 'EchoFiles', 'EchoTimes', 'OutDir', 'RunPrefix'])
args = zip(InData_df['sub'].tolist(),
           InData_df['EchoFiles'].tolist(),
           InData_df['EchoTimes'].tolist(),
           InData_df['OutDir'].tolist(),
           InData_df['RunPrefix'].tolist())

# Tuning notes for the tedana call below:
# fittype: 'loglin' is faster but maybe less accurate than 'curvefit'.
# tedpca: 'mdl' Minimum Description Length returns the least number of
#   components (default) and recommended;
#   'kic' Kullback-Leibler Information Criterion, medium aggression;
#   'aic' Akaike Information Criterion, least aggressive; i.e., returns
#   the most components.
# gscontrol: post-processing to remove spatially diffuse noise. Options
#   implemented here are...
# ...global signal regression (GSR) and minimum image regression (MIR).
# Anatomical CompCor, Go Decomposition (GODEC), and robust PCA can also
# be used (not wired up here).


def RUN_Tedana(sub, EchoFiles, EchoTimes, OutDir, run_prefix):
    """Run the tedana denoising workflow for one acquisition.

    Skips the run when the denoised output already exists, so the script
    can be re-launched after an interruption without redoing finished work.

    Parameters
    ----------
    sub : str
        Subject/session identifier (e.g. "sub-01_ses-1"); used for logging.
    EchoFiles : list of str
        Paths to the per-echo preprocessed BOLD images, in echo order.
    EchoTimes : list of float
        Echo times in milliseconds, sorted ascending.
    OutDir : str
        Directory where tedana writes its outputs.
    run_prefix : str
        Filename prefix for this acquisition's outputs.
    """
    print(sub + '\n')

    # BUG FIX: tedana is invoked below with prefix
    # "<run_prefix>_space-Native", so its denoised output carries that
    # prefix too. The old check looked for
    # "<run_prefix>_desc-denoised_bold.nii.gz", never matched, and
    # therefore re-ran every already-completed acquisition.
    expected_output_filename = f"{run_prefix}_space-Native_desc-denoised_bold.nii.gz"
    expected_output_path = os.path.join(OutDir, expected_output_filename)

    # Skip only when both the output directory and the final file exist.
    if os.path.isdir(OutDir) and os.path.exists(expected_output_path):
        print(f'Tedana analysis for Sub {sub} appears to be complete (found {expected_output_filename}). Skipping.')
    else:
        workflows.tedana_workflow(
            EchoFiles,
            EchoTimes,
            out_dir=OutDir,
            prefix="%s_space-Native" % (run_prefix),
            fittype="curvefit",
            tedpca="aic",
            verbose=True,
            gscontrol=None)


from multiprocessing import Pool

# Guard the pool launch so worker processes (which re-import this module
# under the "spawn" start method) do not recursively create pools; this
# is required by the multiprocessing programming guidelines. The `with`
# block also closes and joins the pool when the work finishes.
if __name__ == "__main__":
    with Pool(cores) as pool:
        results = pool.starmap(RUN_Tedana, args)