#
# PFAS.R    05/07/2024    Ralf Tautenhahn
#
# v1.10
#
#
# m/C and md/C ratio calculation is based on Kaufmann et al. https://doi.org/10.1093/jaoacint/qsac071
#
# Creates columns "eC" (estimated number of carbons based on A1/A0 ratio), 
#                 "m/C" (molecular mass / eC)
#                 "md/C" (mass defect / eC)
#
#                 "F", the number of fluorine atoms in assigned Formula
#                 "maxF(EC)": maximum number of fluorine atoms out of all predicted compositions for this compound
#                 "maxF(CS)": maximum number of fluorine atoms out of all ChemSpider Hits for this compound
#                 "maxF(ML)": maximum number of fluorine atoms out of all Mass List Hits for this compound
#                 
#
#
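# 1.00336 : mass difference between 13C and 12C; the A1 isotope peak is searched
#           in the window [m/z + A1LowTol, m/z + A1HighTol] around this value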
#
#
# Version 1.7  :  small fix due to changes in the database format in CD3.3SP3
# Version 1.8  :  m/z column was missing in export options
# Version 1.9  :  now uses library xml2 instead of library XML to reduce memory usage
# Version 1.10 :  Performance improvements
#

# Search window for the A1 (first isotopic) peak, given as offsets added to
# the compound m/z: the peak must lie in [m/z + A1LowTol, m/z + A1HighTol].
A1LowTol <- 1.000
A1HighTol <- 1.006

# Absolute m/z tolerance used to locate the A0 (monoisotopic) peak:
# the peak must lie in [m/z - mzTOL, m/z + mzTOL].
mzTOL <- 0.01


# Return the position(s) of the table(s) called `name` within JSON_in$Tables.
#
# JSON_in : parsed node_args JSON; each element of JSON_in$Tables carries a
#           TableName entry.
# name    : table name (or vector of names) to look up.
#
# Returns an integer vector of matching positions (empty if nothing matches).
getTableIdx <- function(JSON_in, name) {
  readTableName <- function(tbl) tbl$TableName
  allNames <- sapply(JSON_in$Tables, readTableName)
  isWanted <- allNames %in% name
  seq_along(allNames)[isWanted]
}
  
# Read the exported data file of the table called `name` into a data frame.
#
# JSON_in : parsed node_args JSON; each element of JSON_in$Tables carries a
#           TableName and a DataFile entry.
# name    : name of the table to read.
#
# Returns the table as a data frame (column names are kept verbatim, strings
# stay character). Stops with a message naming the table if it is missing or
# if more than one table matches.
getTable <- function(JSON_in, name) {

  TableNames <- sapply(JSON_in$Tables, function(x) x$TableName)
  idx <- which(TableNames %in% name)
  wanted <- paste(name, collapse = ", ")

  if (length(idx) == 0) {
    stop("Table not found: ", wanted, call. = FALSE)
  }

  # `[[` needs exactly one index; report ambiguity instead of failing with a
  # subscript error.
  if (length(idx) > 1) {
    stop("More than one table matches: ", wanted, call. = FALSE)
  }

  read.table(JSON_in$Tables[[idx]]$DataFile,
             header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
}

# Count the atoms of one element in a flat molecular formula string.
#
# formula : a single formula string such as "C8HF15O2" or "C8 H F15 O2".
# element : element symbol to count, e.g. "F" or "Cl".
#
# Returns the atom count as an integer: 0 if the element is absent or the
# formula is NA/empty, 1 for a symbol without a trailing number. If the symbol
# occurs several times the counts are added up.
#
# The symbol must not be followed by a lower-case letter, so that "F" is not
# found in "Fe" and "C" is not found in "Cl".
getNumberOfElementsFromFormula <- function(formula, element) {
  formula <- as.character(formula)
  if (length(formula) == 0 || is.na(formula) || !nzchar(formula)) {
    return(0L)
  }

  pattern <- paste0(element, "(?![a-z])[0-9]*")
  hits <- regmatches(formula, gregexpr(pattern, formula, perl = TRUE))[[1]]
  if (length(hits) == 0) {
    return(0L)
  }

  # Keep only the trailing digits of each hit; no digits means one atom.
  counts <- sub("^[^0-9]*", "", hits)
  counts[!nzchar(counts)] <- "1"
  sum(as.integer(counts))
}

# Estimate the number of carbons of one compound from the A1/A0 isotope ratio
# of its best MS1 scan and derive the m/C and md/C ratios
# (Kaufmann et al., https://doi.org/10.1093/jaoacint/qsac071).
#
# ci            : row index into Compounds.
# Compounds     : Compounds table; the columns "Compounds ID", "m/z" and
#                 "Calc. MW" are read.
# cdresult.file : path of the result file, opened as an SQLite database.
# mzTOL         : absolute m/z tolerance for locating the A0 peak.
# A1LowTol,
# A1HighTol     : the A1 peak is searched in [m/z + A1LowTol, m/z + A1HighTol].
#
# Returns a named numeric vector c(eC, mC, mdC). All three are 0 when no best
# MS1 scan exists or the A0/A1 peaks cannot be found; mC and mdC are 0 when
# eC is 0 or "Calc. MW" is NA.
#
# The helpers are nested inside this function on purpose: the function is
# shipped to parallel workers, which only receive what is defined in here.
calcmCmD <- function(ci, Compounds, cdresult.file, mzTOL, A1LowTol, A1HighTol) {

  # Decode a zipped XML spectrum (raw vector) into a data frame with the
  # columns mz, intensity and SN (one row per centroid).
  getSpectrumXML2 <- function(zspec, tmpZipName="spec.zip") {

    # Private scratch directory instead of setwd(): the working directory
    # stays untouched and everything is removed again on exit.
    workDir <- tempfile("pfas_spec_")
    dir.create(workDir)
    on.exit(unlink(workDir, recursive = TRUE), add = TRUE)

    # a bit of a workaround since memDecompress() does not work on zip files
    zipPath <- file.path(workDir, tmpZipName)
    writeBin(zspec, zipPath)
    flist <- unzip(zipPath, list = TRUE)
    unzip(zipPath, exdir = workDir)

    xmlfile <- file.path(workDir, flist[1, "Name"])
    xml_data <- xml2::as_list(xml2::read_xml(xmlfile))
    centroidList <- lapply(xml_data$MassSpectrum$PeakCentroids, attributes)

    # One numeric value per centroid; a missing attribute becomes NA so that
    # the columns always have equal length (also for an empty spectrum).
    peakAttribute <- function(attrName) {
      vapply(centroidList, function(a) {
        value <- a[[attrName]]
        if (is.null(value)) NA_real_ else as.numeric(value)
      }, numeric(1), USE.NAMES = FALSE)
    }

    data.frame(mz = peakAttribute("X"),
               intensity = peakAttribute("Y"),
               SN = peakAttribute("SN"))
  }

  # Fetch the spectrum flagged as best MS1 for compound CID from the result
  # file. Returns the spectrum data frame, or NULL if there is none.
  getBestScanMS1 <- function(CID, cdresult.file) {
    library(RSQLite)

    ## public enum BestHitType
    ##
    ## Marks the best available MS1.
    ##
    ##  [DisplayName("Best MS1")]
    ## BestMS1 = 0x00000001

    ## Marks the best available MS2.
    ##
    ##  [DisplayName("Best MS2")]
    ## BestMS2 = 0x00000002

    query <- paste('SELECT MassSpectrumInfoItemsID
                  FROM BestHitIonInstanceItemsMassSpectrumInfoItems, ConsolidatedUnknownCompoundItemsBestHitIonInstanceItems, ConsolidatedUnknownCompoundItems, BestHitIonInstanceItems
                  WHERE BestHitIonInstanceItemsMassSpectrumInfoItems.BestHitIonInstanceItemsID = ConsolidatedUnknownCompoundItemsBestHitIonInstanceItems.BestHitIonInstanceItemsID
                  AND ConsolidatedUnknownCompoundItems.ID = ConsolidatedUnknownCompoundItemsBestHitIonInstanceItems.ConsolidatedUnknownCompoundItemsID
                  AND BestHitIonInstanceItems.ID = ConsolidatedUnknownCompoundItemsBestHitIonInstanceItems.BestHitIonInstanceItemsID
                  AND BestHitIonInstanceItems.BestHitType = 1
                  AND ConsolidatedUnknownCompoundItems.ID =', CID )

    drv <- dbDriver("SQLite")
    con <- dbConnect(drv, cdresult.file)
    # Close the connection on every exit path (early return, stop, success).
    on.exit(dbDisconnect(con), add = TRUE)

    qres <- dbGetQuery(con, query)
    if (nrow(qres) < 1) return(NULL)
    if (nrow(qres) > 1) stop("Error: More than one best MS1 scan ?")

    q2 <- paste0("SELECT Spectrum FROM MassSpectrumItems WHERE ID=",
                 qres[1, "MassSpectrumInfoItemsID"])
    sblob <- dbGetQuery(con, q2)
    if (nrow(sblob) < 1) return(NULL)

    getSpectrumXML2(zspec = sblob$Spectrum[[1]])
  }

  noResult <- c("eC" = 0, "mC" = 0, "mdC" = 0)

  cid <- Compounds[ci, "Compounds ID"]
  MS1Scan <- getBestScanMS1(cid, cdresult.file)

  # Without an MS1 spectrum there is nothing to measure.
  if (is.null(MS1Scan) || nrow(MS1Scan) == 0) {
    return(noResult)
  }

  mz <- Compounds[ci, "m/z"]

  if (is.null(mz)) {
    stop("No m/z value ?")
  }

  # Index of the most intense centroid within [lower, upper];
  # integer(0) if the window is empty.
  strongestPeakIn <- function(lower, upper) {
    hits <- which(MS1Scan[, "mz"] <= upper & MS1Scan[, "mz"] >= lower)
    if (length(hits) > 1) {
      hits <- hits[which.max(MS1Scan[hits, "intensity"])]
    }
    hits
  }

  A0idx <- strongestPeakIn(mz - mzTOL, mz + mzTOL)
  A1idx <- strongestPeakIn(mz + A1LowTol, mz + A1HighTol)

  # calculate the A1/A0 ratio
  # " C = 100 * abundance of the first isotopic peak / abundance of the corresponding monoisotopic peak / 1.1145 " https://doi.org/10.1093/jaoacint/qsac071
  ec <- 0
  if (length(A0idx) > 0 && length(A1idx) > 0) {
    A0int <- MS1Scan[A0idx, "intensity"]
    A1int <- MS1Scan[A1idx, "intensity"]
    ratio <- A1int / A0int
    # A zero or missing A0 intensity gives Inf/NaN/NA; treat as "no estimate".
    if (is.finite(ratio)) {
      ec <- 100 * ratio / 1.1145
    }
  }

  mw <- Compounds[ci, "Calc. MW"]

  if (ec > 0 && !is.na(mw)) {
    mC <- mw / ec

    # Mass defect: fractional part of the mass; values above 0.8 are
    # expressed as a negative defect relative to the next integer.
    md <- mw - floor(mw)
    if (md > 0.8) {
      md <- -(1 - md)
    }

    mdC <- md / ec
  } else {
    mC <- mdC <- 0
  }

  c("eC" = ec, "mC" = mC, "mdC" = mdC)
}


# Read arguments from CD.
args <- commandArgs()

# At least for now, the 6th argument is the name of the JSON file
inputFile <- args[6]
if (is.na(inputFile)) {
  stop("No JSON input file given (expected as 6th command line argument).")
}

# Open JSON file, find exported files, read into tables
library(rjson)
CD_json_in <- fromJSON(file = inputFile)

# need RSQLite to access CD result file, stringr to parse formulas, and xml2 to decode spectrum stored in XML format
library(stringr)
library(RSQLite)
library(xml2)

cdresult.file <- CD_json_in$ResultFilePath

Compounds <- getTable(CD_json_in, "Compounds")
PredictedCompositions <- getTable(CD_json_in, "Predicted Compositions")
Compounds_PredictedCompositions <- getTable(CD_json_in, "ConsolidatedUnknownCompoundItem-PredictedCompositionItem")
ChemSpiderResults <- getTable(CD_json_in, "ChemSpider Results")
Compounds_ChemSpiderResults <- getTable(CD_json_in, "ConsolidatedUnknownCompoundItem-ChemSpiderResultItem")
MassListSearchResults <- getTable(CD_json_in, "Mass List Search Results")
Compounds_MassListSearchResults <- getTable(CD_json_in, "ConsolidatedUnknownCompoundItem-MassListSearchItem")

# for debugging only
# save.image(file="C:\\TEMP\\CD node Rimage_PFAS.dat")
# load("C:\\TEMP\\CD node Rimage_PFAS.dat")

# number of fluorine atoms in assigned Formula F, maximum number of fluorines in all predicted compositions maxFeC, ChemSpider hits maxFCS, Mass List hits maxFML
FeC <- maxFeC <- maxFCS <- maxFML <- integer(nrow(Compounds))

#
# calculate m/C ratio and md/C ratio
# ec, mC, mdC : estimated #C based on A1/A0 ratio, m/C ratio and md/C ratio
#
library(parallel)
nCompounds <- nrow(Compounds)

if (nCompounds > 0) {
  # detectCores() may return NA; never start more workers than compounds.
  nWorkers <- max(1L, min(detectCores(), nCompounds), na.rm = TRUE)
  cl <- makeCluster(nWorkers)
  # finally= makes sure the workers are shut down even if a compound fails.
  mres <- tryCatch(
    t(parSapply(cl, X = seq_len(nCompounds), FUN = calcmCmD, Compounds, cdresult.file,
                mzTOL = mzTOL, A1LowTol = A1LowTol, A1HighTol = A1HighTol, USE.NAMES = FALSE)),
    finally = stopCluster(cl)
  )
} else {
  # Empty Compounds table: keep the shape the rest of the script expects.
  mres <- matrix(numeric(0), nrow = 0, ncol = 3,
                 dimnames = list(NULL, c("eC", "mC", "mdC")))
}


# Maximum number of fluorine atoms among all hits linked to one compound.
#
# cid      : value of "Compounds ID" of the compound.
# links    : link table with the columns "Compounds ID" and idColumn.
# hits     : hit table with the columns idColumn and "Formula".
# idColumn : name of the hit ID column shared by links and hits.
#
# Returns 0 when the compound has no linked hits (or none of the linked IDs
# is present in the hit table).
maxFluorineAmongHits <- function(cid, links, hits, idColumn) {
  linkedIDs <- links[links[, "Compounds ID"] %in% cid, idColumn]
  formulas <- hits[hits[, idColumn] %in% linkedIDs, "Formula"]
  if (length(formulas) == 0) {
    return(0L)
  }
  nF <- vapply(formulas, getNumberOfElementsFromFormula, numeric(1),
               element = "F", USE.NAMES = FALSE)
  as.integer(max(nF))
}

## loop through all Compounds
for (ci in seq_len(nrow(Compounds))) {

  cid <- Compounds[ci, "Compounds ID"]

  #
  # calculate FeC, the number of fluorine atoms in assigned Formula
  #
  FeC[ci] <- getNumberOfElementsFromFormula(Compounds[ci, "Formula"], "F")

  #
  # calculate maxF(EC): maximum number of fluorine atoms out of all predicted compositions for this compound
  #
  maxFeC[ci] <- maxFluorineAmongHits(cid, Compounds_PredictedCompositions,
                                     PredictedCompositions,
                                     "Predicted Compositions ID")

  #
  # calculate maxF(CS): maximum number of fluorine atoms out of all ChemSpider Hits for this compound
  #
  maxFCS[ci] <- maxFluorineAmongHits(cid, Compounds_ChemSpiderResults,
                                     ChemSpiderResults,
                                     "ChemSpider Results CSID")

  #
  # calculate maxF(ML): maximum number of fluorine atoms out of all Mass List Hits for this compound
  #
  maxFML[ci] <- maxFluorineAmongHits(cid, Compounds_MassListSearchResults,
                                     MassListSearchResults,
                                     "Mass List Search Results ID")
}


# add result columns to table
data.output <- cbind(Compounds, "eC" = mres[,"eC"], "m/C" = mres[,"mC"], "md/C" = mres[,"mdC"], "F"=FeC, "maxF(EC)"=maxFeC, "maxF(CS)" = maxFCS, "maxF(ML)" = maxFML)

# Describe the new columns in the JSON structure of the Compounds table.
# Name -> data type, in the order in which the descriptions are appended;
# every new column is placed after the "Formula" column.
newColumnTypes <- c("eC"       = "Float",
                    "m/C"      = "Float",
                    "md/C"     = "Float",
                    "F"        = "Int",
                    "maxF(EC)" = "Int",
                    "maxF(CS)" = "Int",
                    "maxF(ML)" = "Int")

# Look the table up by name, as is done when the table is written out,
# instead of assuming it is the first one.
compoundsTableIdx <- getTableIdx(CD_json_in, "Compounds")

for (columnName in names(newColumnTypes)) {
  newcolumn <- list(ColumnName = columnName,
                    ID = "",
                    DataType = newColumnTypes[[columnName]],
                    Options = list(PositionAfter = "Formula"))

  nDescriptions <- length(CD_json_in$Tables[[compoundsTableIdx]]$ColumnDescriptions)
  CD_json_in$Tables[[compoundsTableIdx]]$ColumnDescriptions[[nDescriptions + 1]] <- newcolumn
}


# Write modified Compounds table to temporary folder.
Compounds.idx <- getTableIdx(CD_json_in, "Compounds")
Compounds.datafile <- CD_json_in$Tables[[ Compounds.idx  ]]$DataFile

# Only rewrite a literal ".txt" at the very end of the path; an unanchored
# pattern would also hit things like "mytxt" elsewhere in the path. A path
# without that extension gets the suffix appended so the input file is never
# overwritten.
if (grepl("\\.txt$", Compounds.datafile)) {
  resultout <- sub("\\.txt$", ".out.txt", Compounds.datafile)
} else {
  resultout <- paste0(Compounds.datafile, ".out.txt")
}
write.table(data.output, file = resultout, sep='\t', row.names = FALSE)

# Write out node_response.json file - use same file as node_args.json but change the pathway input file to the new one

CD_json_in$Tables[[Compounds.idx]]$DataFile <- resultout
jsonOutFile <- CD_json_in$ExpectedResponsePath

# Remove all the other tables in the JSON so that only the new Compounds table is used
CD_json_in$Tables <- CD_json_in$Tables[Compounds.idx]

responseJSON <- toJSON(CD_json_in, indent=1, method="C")

# responseJSON has incorrect format for the empty Options lists.  Will use a regular expression to find and replace the [\n\n\] with the {}

responseJSON2 <- gsub("\\[\n\n[[:blank:]]+\\]", "{ }", responseJSON)

jsonfileconn <- file(jsonOutFile)

writeLines(responseJSON2, jsonfileconn)

close (jsonfileconn)



