Mercurial > repos > ecology > vigiechiro_idvalid

diff IdValid.R @ 1:1a77241a9fab draft default tip
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tools/vigiechiro commit 7ef0e58cbcbf41088e359f00b6c86504c773c271
author: ecology
date: Fri, 26 Apr 2019 12:15:41 -0400
parents: aa3afb770148
--- a/IdValid.R	Wed Mar 13 11:17:15 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,165 +0,0 @@
-#!/usr/bin/env Rscript
-
-suppressMessages(library(data.table))
-
-ValidHier=function(x,y) #used to write validator id over observer id
-{
-  #cat(y)
-  if(is.na(y)){x}else{y}
-}
-
-f2p <- function(x) #get date-time data from recording file names
-{
-  if (is.data.frame((x)[1])) {pretemps <- vector(length = nrow(x))}
-  op <- options(digits.secs = 3)
-  pretemps <- paste(substr(x, nchar(x) - 18, nchar(x)-4), ".", substr(x, nchar(x) - 2, nchar(x)), sep = "")
-  strptime(pretemps, "%Y%m%d_%H%M%OS",tz="UTC")-7200
-}
-
-args <- commandArgs(trailingOnly = TRUE)
-
-
-IdCorrect=fread(args[1])
-RefSeuil=fread(args[2])
-#IdV=as.data.frame(subset(IdCorrect,select=observateur_taxon:validateur_probabilite))
-
-#Step 0 :compute id score from 2nd Layer
-test=match("participation",names(IdCorrect))
-IdCorrect$IdScore=apply(as.data.frame(IdCorrect)[,(test+1):(ncol(IdCorrect)-1)],MARGIN=1,FUN=max)
-#compute true success probabilities according to logistic regression issued from "Referentiel_seuils"
-CorrSp=match(IdCorrect$ProbEsp_C2bs,RefSeuil$Espece)
-PSp=RefSeuil$Pente[CorrSp]
-ISp=RefSeuil$Int[CorrSp]
-suppressWarnings(IdCorrect$IdProb<-mapply(FUN=function(w,x,y) if((!is.na(y))&(y>0)&(y<1000)) {(exp(y*w+x)/(1+exp(y*w+x)))}else{w} ,IdCorrect$IdScore,ISp,PSp))
-
-#Step 1 :compute id with confidence regarding a hierarchy (validator > observer)
-IdCorrect$IdV=mapply(ValidHier,IdCorrect$observateur_taxon,IdCorrect$validateur_taxon)
-IdCorrect$ConfV=mapply(ValidHier,IdCorrect$observateur_probabilite
-                       ,IdCorrect$validateur_probabilite)
-
-
-
-#Step 2: Get numerictime data
-suppressWarnings(IdCorrect$Session<-NULL)
-suppressWarnings(IdCorrect$TimeNum<-NULL)
-
-if (substr(IdCorrect$`nom du fichier`[1],2,2)=="i") #for car/walk transects
-{
-  FileInfo=as.data.table(tstrsplit(IdCorrect$`nom du fichier`,"-"))
-  IdCorrect$Session=as.numeric(substr(FileInfo$V4,5,nchar(FileInfo$V4)))
-  TimeSec=as.data.table(tstrsplit(FileInfo$V5,"_"))
-  TimeSec=as.data.frame(TimeSec)
-  if(sum(TimeSec[,(ncol(TimeSec)-1)]!="00000")==0) #to deal with double Kaleidoscope treatments
-  {
-    print("NOMS DE FICHIERS NON CONFORMES")
-    print("Vous les avez probablement traiter 2 fois par Kaleidoscope")
-    stop("Merci de nous signaler cette erreur par mail pour correction")
-      }else{
-        IdCorrect$TimeNum=(IdCorrect$Session*800
-        +as.numeric(TimeSec[,(ncol(TimeSec)-1)])
-        +as.numeric(TimeSec[,(ncol(TimeSec))])/1000)
-  }
-  
-}else{
-  if(substr(IdCorrect$`nom du fichier`[1],2,2)=="a") #for stationary recordings
-  {
-    DateRec=as.POSIXlt(f2p(IdCorrect$`nom du fichier`))
-    Nuit=format(as.Date(DateRec-43200*(DateRec$hour<12)),format="%d/%m/%Y")
-    #Nuit[is.na(Nuit)]=0
-    IdCorrect$Session=Nuit
-    IdCorrect$TimeNum=as.numeric(DateRec)
-    
-    }else{
-      print("NOMS DE FICHIERS NON CONFORMES")
-       stop("Ils doivent commencer par Cir (routier/pedestre) ou par Car (points fixes")
-    }
-}
-
-
-
-
-#Step 3 :treat sequentially each species identified by Tadarida-C
-IdExtrap=vector() #to store the id extrapolated from validations
-IdC2=IdCorrect[0,] #to store data in the right order
-TypeE=vector() #to store the type of extrapolation made
-for (j in 1:nlevels(as.factor(IdCorrect$ProbEsp_C2bs)))
-{
-  IdSp=subset(IdCorrect
-              ,IdCorrect$ProbEsp_C2bs==levels(as.factor(IdCorrect$ProbEsp_C2bs))[j])
-  if(sum(is.na(IdSp$IdV))==(nrow(IdSp))) #case 1 : no validation no change
-  {
-    IdC2=rbind(IdC2,IdSp)
-    IdExtrap=c(IdExtrap,rep(IdSp$ProbEsp_C2bs[1],nrow(IdSp)))
-    TypeE=c(TypeE,rep(0,nrow(IdSp)))
-  }else{ #case 2: some validation
-    Vtemp=subset(IdSp,is.na(IdSp$IdV))
-      #case2A: validations are homogeneous
-    if(nlevels(as.factor(Vtemp$IdV))==1)
-    {
-      IdC2=rbind(IdC2,IdSp)
-      IdExtrap=c(IdExtrap,rep(Vtemp$IdV[1],nrow(IdSp)))
-      TypeE=c(TypeE,rep(2,nrow(IdSp)))
-    }else{
-      #case 2B: validations are heterogeneous
-      #case 2B1: some validations confirms the species identified by Tadarida and highest confidence are confirmed
-      subVT=subset(Vtemp,Vtemp$IdV==levels(as.factor(IdCorrect$ProbEsp_C2bs))[j])
-      subVF=subset(Vtemp,Vtemp$IdV!=levels(as.factor(IdCorrect$ProbEsp_C2bs))[j])
-      if((nrow(subVT)>0)&(max(subVT$IdProb)>max(subVF$IdProb)))
-      {
-        Vtemp=Vtemp[order(Vtemp$IdProb),]
-        test=(Vtemp$IdV!=Vtemp$ProbEsp_C2bs)
-        Fr1=max(which(test == TRUE)) #find the error with highest indices
-        Thr1=mean(Vtemp$IdProb[(Fr1):(Fr1+1)]) #define first threshold as the median confidence between the first error and the confirmed ID right over it
-        #id over this threshold are considered right
-        IdHC=subset(IdSp,IdSp$IdProb>Thr1)
-        IdC2=rbind(IdC2,IdHC)
-        IdExtrap=c(IdExtrap,rep(Vtemp$IdV[nrow(Vtemp)],nrow(IdHC)))
-        TypeE=c(TypeE,rep(2,nrow(IdHC)))
-        #id under this threshold are attributed to validated id closest in time
-        Vtemp=Vtemp[order(Vtemp$TimeNum),]
-        cuts <- c(-Inf, Vtemp$TimeNum[-1]-diff(Vtemp$TimeNum)/2, Inf)
-        CorrV=findInterval(IdSp$TimeNum, cuts)
-        IdE=Vtemp$IdV[CorrV]
-        IdEL=subset(IdE,IdSp$IdProb<=Thr1)
-        IdLC=subset(IdSp,IdSp$IdProb<=Thr1)
-        IdExtrap=c(IdExtrap,IdEL)
-        TypeE=c(TypeE,rep(1,length(IdEL)))
-        IdC2=rbind(IdC2,IdLC)
-        
-            
-        }else{
-          #case 2B2: all validations concerns errors
-          #id are extrapolated on time only
-          Vtemp=Vtemp[order(Vtemp$TimeNum),]
-          cuts <- c(-Inf, Vtemp$TimeNum[-1]-diff(Vtemp$TimeNum)/2, Inf)
-          CorrV=findInterval(IdSp$TimeNum, cuts)
-          IdE=Vtemp$IdV[CorrV]
-          IdExtrap=c(IdExtrap,IdE)
-          TypeE=c(TypeE,rep(1,length(IdE)))
-          IdC2=rbind(IdC2,IdSp)
-          }
-      }
-    
-    
-  }
-  
-  
-}
-test1=(nrow(IdC2)==length(IdExtrap))
-test2=(nrow(IdC2)==nrow(IdCorrect))
-if((test1==F)|(test2==F))
-{
-  (stop("Erreur de traitement !!!"))
-}
-
-IdC2$IdExtrap=IdExtrap
-IdC2$TypeE=TypeE
-
-
-IdC2=IdC2[order(IdC2$IdProb,decreasing=T),]
-IdC2=IdC2[order(IdC2$ConfV,decreasing=T),]
-IdC2=IdC2[order(IdC2$`nom du fichier`),]
-#discard duplicated species within the same files (= false positives corrected by 2nd layer)
-IdC2=unique(IdC2,by=c("nom du fichier","IdExtrap"))
-
-write.table(IdC2,"output.tabular",row.names=F,sep="\t",quote=FALSE,na="NA")
author	ecology
date	Fri, 26 Apr 2019 12:15:41 -0400
parents	aa3afb770148
children