# Positional command-line arguments (14 expected, in this order).
args <- commandArgs(trailingOnly = TRUE)

# Paths of the tab-separated input tables.
summaryfile              <- args[1]
sequencesfile            <- args[2]
mutationanalysisfile     <- args[3]
mutationstatsfile        <- args[4]
hotspotsfile             <- args[5]
gene_identification_file <- args[6]

# Paths of the files this script writes.
output             <- args[7]
before.unique.file <- args[8]
unmatchedfile      <- args[9]

# Processing options.
method        <- args[10]  # "blastn" triggers reshaping of the gene identification table
functionality <- args[11]  # "productive", "unproductive", "remove_unknown" or anything else (no filter)
unique_type   <- args[12]  # which columns define a duplicate in the first de-duplication
filter_unique <- args[13]  # "no" disables the sequence-based de-duplication
class_filter  <- args[14]  # two numeric thresholds joined by "_"

# Every input is read the same way: tab-separated with a header row, short
# rows padded (fill), quoting disabled and strings kept as character.
read_tab_table <- function(path) {
	read.table(path, header = TRUE, sep = "\t", fill = TRUE, stringsAsFactors = FALSE, quote = "")
}

summ                <- read_tab_table(summaryfile)
sequences           <- read_tab_table(sequencesfile)
mutationanalysis    <- read_tab_table(mutationanalysisfile)
mutationstats       <- read_tab_table(mutationstatsfile)
hotspots            <- read_tab_table(hotspotsfile)
gene_identification <- read_tab_table(gene_identification_file)

# For blastn results, reshape the hit table into the five columns the rest of
# the script expects (Sequence.ID, chunk_hit_percentage, nt_hit_percentage,
# start_locations, best_match).
if (method == "blastn") {
	# Column layout of the blastn table:
	# qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore

	# Only the first listed hit of each query is kept.
	first_hit <- !duplicated(gene_identification$qseqid)
	gene_identification <- gene_identification[first_hit, ]

	# Length used as denominator for the hit percentage, per subject id.
	# Subject ids missing from this table get ref.length NA (all.x = TRUE).
	ref_length <- data.frame(
		sseqid = c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"),
		ref.length = c(81, 81, 141, 141, 141, 141, 52)
	)
	gene_identification <- merge(gene_identification, ref_length, by = "sseqid", all.x = TRUE)

	# Alignment length as a percentage of the reference length.
	gene_identification$chunk_hit_percentage <- (gene_identification$length / gene_identification$ref.length) * 100

	kept_columns <- c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")
	gene_identification <- gene_identification[, kept_columns]
	colnames(gene_identification) <- c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")
}

print(paste("Number of sequences in summary file:", nrow(summ)))

# Inner join: sequences without a gene identification row are dropped here.
summ <- merge(summ, gene_identification, by = "Sequence.ID")

# Rows flagged "No results" are always removed, whatever filter is selected.
has_results <- summ$Functionality != "No results"
summ <- summ[has_results, ]

print(paste("Number of sequences after merging with annotation:", nrow(summ)))

# Optional filter on the Functionality column; any other value of
# `functionality` leaves the table untouched.
status <- summ$Functionality
if (functionality == "productive") {
	keep <- status == "productive (see comment)" | status == "productive"
	summ <- summ[keep, ]
} else if (functionality == "unproductive") {
	keep <- status == "unproductive (see comment)" | status == "unproductive"
	summ <- summ[keep, ]
} else if (functionality == "remove_unknown") {
	keep <- status != "No results" & status != "unknown (see comment)" & status != "unknown"
	summ <- summ[keep, ]
}

print(paste("Number of sequences after productive filter:", nrow(summ)))

# class_filter holds two numeric thresholds joined by "_":
# <minimum chunk_hit_percentage>_<minimum nt_hit_percentage>.
splt <- strsplit(class_filter, "_")[[1]]
chunk_hit_threshold <- as.numeric(splt[1])
nt_hit_threshold <- as.numeric(splt[2])

# TRUE where both percentages reach their threshold; NA where a percentage
# (or a threshold) is missing.
higher_than <- (summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold)

unmatched_columns <- c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")

# Start with an empty table so the unmatched output always has its header.
unmatched <- summ[NULL, unmatched_columns]

# Rows that definitely fail a threshold. which() ignores NA, so rows whose
# comparison is NA are left as they are instead of producing all-NA rows in
# `unmatched` and aborting the data-frame assignment below (NA subscripts are
# not allowed there).
below_threshold <- which(!higher_than)

if (length(below_threshold) > 0) {
	unmatched <- summ[below_threshold, unmatched_columns]
	unmatched$best_match <- paste("unmatched,", unmatched$best_match)
	# Failing sequences stay in summ; they are only relabelled.
	summ[below_threshold, "best_match"] <- paste("unmatched,", summ[below_threshold, "best_match"])
}

print(paste("Number of matched sequences:", sum(!grepl("^unmatched", summ$best_match))))

if (nrow(summ) == 0) {
	stop("No data remaining after filter")
}

# Inner-join `right` onto `left` by Sequence.ID, first discarding every column
# of `right` that `left` already has (other than the key) so no .x/.y
# duplicates appear. drop = FALSE keeps the subset a data frame even when only
# the key column is left; without it the subset collapses to a vector and the
# merge fails.
merge_new_columns <- function(left, right) {
	already_present <- names(right) %in% setdiff(names(left), "Sequence.ID")
	merge(left, right[, !already_present, drop = FALSE], by = "Sequence.ID")
}

result <- merge_new_columns(summ, mutationanalysis)

print(paste("Number of sequences after merging with mutation analysis:", nrow(result)))

result <- merge_new_columns(result, mutationstats)

print(paste("Number of sequences after merging with mutation stats:", nrow(result)))

result <- merge_new_columns(result, hotspots)

print(paste("Number of sequences after merging with hotspots:", nrow(result)))

# De-duplicate on the key selected by unique_type; the first row of each key
# is kept. Any unrecognised unique_type gives every row its own key, i.e. no
# rows are removed.
#
# The VGene column is only added to `result` further down the script, so it is
# normally absent at this point and would silently fall out of the key. The
# V gene is therefore derived here into a local vector (leading "Homsap " and
# everything from the first "*" removed), without adding a column to `result`.
v_gene <- if ("VGene" %in% names(result)) {
	result$VGene
} else {
	gsub("[*].*", "", gsub("^Homsap ", "", result$V.GENE.and.allele))
}

if (unique_type == "AA.JUNCTION_V_subclass") {
	result$past <- paste(result$AA.JUNCTION, v_gene, result$best_match)
} else if (unique_type == "AA.JUNCTION_subclass") {
	result$past <- paste(result$AA.JUNCTION, result$best_match)
} else if (unique_type == "V_subclass") {
	result$past <- paste(v_gene, result$best_match)
} else if (unique_type == "AA.JUNCTION_V") {
	result$past <- paste(result$AA.JUNCTION, v_gene)
} else if (unique_type == "AA.JUNCTION") {
	result$past <- paste(result$AA.JUNCTION)
} else {
	# seq_len() is empty for a zero-row table, where 1:nrow() would give c(1, 0).
	result$past <- seq_len(nrow(result))
}

result <- result[!(duplicated(result$past)), ]

# The key column is only a helper and must not reach the output.
result <- result[, !(names(result) %in% c("past"))]

print(paste("Number of sequences in result after", unique_type, "filtering:", nrow(result)))

# Attach the per-region sequences (renamed with a ".seq" suffix) to result,
# then remove every row that has an 'n' or 'N' in its CDR1, FR2, CDR2, FR3 or
# CDR3 sequence. FR1 is merged in but not checked.
region_columns <- c("FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")
sequences <- sequences[, c("Sequence.ID", region_columns)]
names(sequences) <- c("Sequence.ID", paste0(region_columns, ".seq"))

# Left join: rows of result without a sequences entry are kept.
result <- merge(result, sequences, by = "Sequence.ID", all.x = TRUE)

print(paste("Number of sequences in result after merging with sequences:", nrow(result)))

checked_columns <- c("FR2.IMGT.seq", "FR3.IMGT.seq", "CDR1.IMGT.seq", "CDR2.IMGT.seq", "CDR3.IMGT.seq")
contains_n <- Reduce(`|`, lapply(checked_columns, function(column) grepl("n|N", result[[column]])))
result <- result[!contains_n, ]

print(paste("Number of sequences in result after n filtering:", nrow(result)))

# Mutation-count columns that are converted from text to plain numbers.
cleanup_columns <- c("FR1.IMGT.Nb.of.mutations",
                     "CDR1.IMGT.Nb.of.mutations",
                     "FR2.IMGT.Nb.of.mutations",
                     "CDR2.IMGT.Nb.of.mutations",
                     "FR3.IMGT.Nb.of.mutations")

for (col in cleanup_columns) {
  # Remove anything in parentheses, then convert what is left to a number.
  result[, col] <- gsub("\\(.*\\)", "", result[, col])
  result[, col] <- as.numeric(result[, col])
  # Values that could not be converted become 0. Only this column is
  # overwritten; indexing without the column would zero the entire row,
  # Sequence.ID and every other field included.
  result[is.na(result[, col]), col] <- 0
}

# Shorten a "<gene> and allele" label: remove a leading "Homsap " and
# everything from the first "*" onwards.
strip_gene_label <- function(label) {
	gsub("[*].*", "", gsub("^Homsap ", "", label))
}

result$VGene <- strip_gene_label(result$V.GENE.and.allele)
result$JGene <- strip_gene_label(result$J.GENE.and.allele)

# Snapshot of the table before the sequence-based de-duplication below.
write.table(x = result, file = before.unique.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

# Sequence-based de-duplication, skipped when filter_unique is "no".
#  - the key is the CDR1, FR2, CDR2, FR3 and CDR3 sequences, plus best_match
#    when filter_unique contains "_c";
#  - when filter_unique contains "keep", one row per key is kept;
#  - otherwise only keys that occur more than once survive, again one row each.
# The helper column unique.def is left in the table and so ends up in the output.
if (filter_unique != "no") {
	sequence_key <- paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
	if (grepl("_c", filter_unique)) {
		sequence_key <- paste(sequence_key, result$best_match)
	}
	result$unique.def <- sequence_key

	first_occurrence <- !duplicated(result$unique.def)

	if (grepl("keep", filter_unique)) {
		result <- result[first_occurrence, ]
	} else {
		# A key is repeated when it is a duplicate scanning from either end.
		repeated <- duplicated(result$unique.def) | duplicated(result$unique.def, fromLast = TRUE)
		result <- result[repeated & first_occurrence, ]
	}
}

# Final row counts, then the two output tables (tab-separated, unquoted,
# header row, no row names).
print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))

print(paste("Number of rows in result:", nrow(result)))
print(paste("Number of rows in unmatched:", nrow(unmatched)))

write.table(x = result, file = output, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
write.table(x = unmatched, file = unmatchedfile, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
