is_relevant <- function(qrels,docid) { docid=toString(docid) fila = subset(qrels, DOC_ID==docid) if (nrow(fila)==0) {return(0)} else { return(fila$REL) } } estimate_prel_at_pos_from_individual_curve<- function(C, s, pos) { prel = C * pos^s -1 return(prel) } get_estimations <- function(rangemin,rangemax, pre_computed_estimated_prel_at_pos) { estimations = rep(NA,nrow(pre_computed_estimated_prel_at_pos)) for (iq in 1:nrow(pre_computed_estimated_prel_at_pos)) estimations[iq] = sum(pre_computed_estimated_prel_at_pos[iq,rangemin:rangemax]) return(estimations) } pre_compute_estimated_prel_at_pos<-function(C_lists, s_lists, maxpoolsize) { table_rdo = matrix(nrow=length(C_lists),ncol=maxpoolsize) for (iq in 1:length(C_lists)) for (pos in 1:maxpoolsize) table_rdo[iq,pos] = estimate_prel_at_pos_from_individual_curve(C_lists[[iq]], s_lists[[iq]], pos) return(table_rdo) } get_future_Fs <- function(nrels_so_far, rangemin,rangemax, pre_computed_estimated_prel_at_pos, ws) { future_Fs = rep(NA,rangemax-rangemin+1) accrels = nrels_so_far ifuture_Fs = 1 for ( pos in rangemin:rangemax ) { estimated_future_rels = rep(NA,nrow(pre_computed_estimated_prel_at_pos)) for (iq in 1:nrow(pre_computed_estimated_prel_at_pos)) estimated_future_rels[iq] = sum(pre_computed_estimated_prel_at_pos[iq,pos:rangemax]) accrels = accrels + sum(ws* pre_computed_estimated_prel_at_pos[,pos] )/sum(ws) nestimated_future_rels = sum(ws*estimated_future_rels)/sum(ws) eP = accrels/pos eR = accrels/(nrels_so_far+nestimated_future_rels) future_Fs[ifuture_Fs] = (2*eP*eR) / (eP+eR) ifuture_Fs = ifuture_Fs + 1 } return(future_Fs) } # perf_code = 0 makes estimation based on p@n, perf_code=1 makes estimation based on norm_acc_prec get_query_weights<-function(n, this_query_p_at_n, this_query_norm_acc_prec_at_n, p_at_n_lists, norm_acc_prec_at_n_lists, perf_code) { weights = rep(NA,length(p_at_n_lists)) if(perf_code==0) { perf = this_query_p_at_n perf_lists = p_at_n_lists } else { perf = this_query_norm_acc_prec_at_n perf_lists = norm_acc_prec_at_n_lists } for (iq in 1:length(perf_lists)) { if (n <= length(perf_lists[[iq]]) ) weights[iq]= 1 - abs( perf - (perf_lists[[iq]])[n] ) else { # this training query does not get to size n, we compute the closeness from # the test query @ n to the training query @ the last available point weights[iq]= 1 - abs( perf - (perf_lists[[iq]])[ length(perf_lists[[iq]]) ] ) } } return(weights) } # i+1,length(judgments1),this_query_p_at_n_list[i], this_query_norm_acc_prec_at_n_list[i], p_at_n_lists, norm_acc_prec_at_n_lists, C_lists, s_lists estimate_future_rels_and_Fs <- function(n, nrels_so_far, rangemin,rangemax,this_query_p_at_n, this_query_norm_acc_prec_at_n, p_at_n_lists, norm_acc_prec_at_n_lists, pre_computed_estimated_prel_at_pos, perf_code) { if (rangemin>rangemax) { future_rels=0 future_Fs=c() return(list("FR"=future_rels,"FF"=future_Fs)) } ws = get_query_weights(n, this_query_p_at_n, this_query_norm_acc_prec_at_n, p_at_n_lists, norm_acc_prec_at_n_lists, perf_code) estimations = get_estimations(rangemin,rangemax, pre_computed_estimated_prel_at_pos) #weighted sum estimated_future_rels = sum(ws*estimations)/sum(ws) future_Fs = get_future_Fs(nrels_so_far, rangemin,rangemax, pre_computed_estimated_prel_at_pos, ws) return(list("FR"=estimated_future_rels,"FF"=future_Fs)) } # judgement_list_path: file created by process_multiple_queries # qrels_path: path to the qrel file # training_curves_and_perfs_file: path to the file produced by stopping_fit_training_queries (fitted curves and performances) # perf_code=0 means that perf is computed as p@n # perf_code=1 means that perf is computed by AvgP # writes the subqrels in subqrel_file_path stopping_stop_if_no_better_expectations <- function(judgement_list_path, qrels_path, training_curves_and_perfs_file, perf_code, subqrel_file_path) { # loads "p_at_n_lists","norm_acc_prec_at_n_lists","C_lists","s_lists" load(training_curves_and_perfs_file) # reads the judgment_list ... gets judgments_lists variable ... load(judgement_list_path) print(paste("Judgement list file...",judgement_list_path,"...",length(judgments_lists)," judgment list loaded.")) # reads the qrel file into an R dataframe with appropriate column names qrels_df= read.table(qrels_path,header=FALSE) names(qrels_df)=c("QUERY","DUMMY","DOC_ID","REL") print(paste("Qrel file...",qrels_path,"...",nrow(qrels_df)," judgments.")) queries= unique(qrels_df$QUERY) poolsizes=rep(0,length(queries)) iq=1 for (q in queries) { #get the judgment list from judgments_lists judgments1 = judgments_lists[[iq]] poolsizes[iq]=length(judgments1) iq = iq + 1 } maxpoolsize=max(poolsizes) iq=1 pre_computed_estimated_prel_at_pos = pre_compute_estimated_prel_at_pos(C_lists, s_lists,maxpoolsize) qrel_lines="" for (q in queries) { #get the judgment list from judgments_lists judgments1 = judgments_lists[[iq]] current_ranking1=data.frame(DOCID=judgments1, REL=rep(NA,length(judgments1))) # get the relevance assessments for the current query current_qrels = subset(qrels_df, QUERY==q) # assign the relevance column for each document in the sequence for (i in 1:length(judgments1)) current_ranking1[i,"REL"]=is_relevant(current_qrels,current_ranking1[i,"DOCID"]) real_total_num_rels = sum(current_ranking1$REL) print(paste("Query...",q,", pool size:", length(judgments1), ". ", sum(current_ranking1$REL)," docs are relevant.",sep="" )) this_query_p_at_n_list=rep(NA, length(judgments1) ) this_query_norm_acc_prec_at_n_list=rep(NA, length(judgments1) ) rels = 0 acc_precs = 0 i=1 while(1) { if (current_ranking1[i,"REL"] == 1) { rels = rels + 1 acc_precs = acc_precs + rels/i qrel_line= paste(q,"0",current_ranking1[i,"DOCID"],"1") } else qrel_line= paste(q,"0",current_ranking1[i,"DOCID"],"0") if (qrel_lines=="") qrel_lines=qrel_line else qrel_lines=paste(qrel_lines,qrel_line,sep="\n") this_query_p_at_n_list[i]=rels/i this_query_norm_acc_prec_at_n_list[i]=acc_precs/i if (rels==0) estimated_F=0 else { list_rdo = estimate_future_rels_and_Fs(i, rels, i+1,length(judgments1),this_query_p_at_n_list[i], this_query_norm_acc_prec_at_n_list[i], p_at_n_lists, norm_acc_prec_at_n_lists, pre_computed_estimated_prel_at_pos, perf_code) future_rels = list_rdo$FR estimated_total_num_rels = rels + future_rels P = rels / i eR = rels / estimated_total_num_rels estimated_F= (2*P*eR) / (P+eR) # no more future Fs available -end of the list- if (length(list_rdo$FF)==0) break if (estimated_F > max(list_rdo$FF)) break } i=i+1 } iq=iq+1 } # for q in queries write(qrel_lines,subqrel_file_path) }