为了提高距离比较操作的效率,一种自然的想法是对对象执行随机投影,在更少的维度上计算距离,从而降低高维度的计算消耗 ( DCOs 是一个时间复杂度 O(D) 的操作,D 是向量维度)。具体来说,是将一个高维向量 x 与一个 d 行 D 列的随机正交矩阵相乘 (d < D 才能取得降维的效果),然后基于得到的低维向量计算近似距离。
/* The file is the core of the ADSampling algorithm. We have included detailed comments in the function dist_comp. Note that in the whole algorithm we do not calculate the square root of the distances. */
unsignedint D = 960; // The dimensionality of the dataset. float epsilon0 = 2.1; // epsilon0 - by default 2.1, recommended in [1.0,4.0], valid in in [0, +\infty) unsignedint delta_d = 32; // dimension sampling for every delta_d dimensions.
// The hypothesis testing checks whether \sqrt{D/d} dis' > (1 + epsilon0 / \sqrt{d}) * r. // We equivalently check whether dis' > \sqrt{d/D} * (1 + epsilon0 / \sqrt{d}) * r. inlinefloatratio(constint &D, constint &i){ if(i == D)return1.0; return1.0 * i / D * (1.0 + epsilon0 / std::sqrt(i)) * (1.0 + epsilon0 / std::sqrt(i)); }
/* float dist_comp(const float&, const void *, const void *, float, int) is a generic function for DCOs. When D, epsilon_0 and delta_d can be pre-determined, it is highly suggested to define them as constexpr and provide dataset-specific functions. */ floatdist_comp(constfloat& dis, constvoid *data, constvoid *query, float res = 0, int i = 0){ // If the algorithm starts a non-zero dimensionality (i.e., the case of IVF++), we conduct the hypothesis testing immediately. if(i && res >= dis * ratio(D, i)){ return -res * D / i; } float * q = (float *) query; float * d = (float *) data; while(i < D){ // It continues to sample additional delta_d dimensions. int check = std::min(delta_d, D-i); i += check; for(int j = 1;j<=check;j++){ float t = *d - *q; d ++; q ++; res += t * t; } // Hypothesis tesing if(res >= dis * ratio(D, i)){
// If the null hypothesis is reject, we return the approximate distance. // We return -dis' to indicate that it's a negative object. return -res * D / i; } }
// We return the exact distance when we have sampled all the dimensions. return res; }
};
floatsqr_dist(float* a, float* b, int D){ float ret = 0; for(int i=0;i!=D;i++){ float tmp = (*a - *b); ret += tmp * tmp; a++; b++; } return ret; }
// answers - the KNN set R1 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>> answers; // top_candidates - the result set R2 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>> top_candidates; // candidate_set - the search set S std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidate_set; dist_t lowerBound; dist_t lowerBoundcan; // Insert the entry point to the result and search set with its exact distance as a key. if (!has_deletions || !isMarkedDeleted(ep_id)) {
visited_array[ep_id] = visited_array_tag; int cnt_visit = 0; // Iteratively generate candidates. while (!candidate_set.empty()) { std::pair<dist_t, tableint> current_node_pair = candidate_set.top();
// When the smallest object in S has its distance larger than the largest in R2, terminate the algorithm. if ((-current_node_pair.first) > top_candidates.top().first && (top_candidates.size() == ef || has_deletions == false)) { break; } candidate_set.pop();
// Fetch the smallest object in S. tableint current_node_id = current_node_pair.second; int *data = (int *) get_linklist0(current_node_id); size_t size = getListCount((linklistsizeint*)data); if(collect_metrics){ metric_hops++; metric_distance_computations+=size; }
// Enumerate all the neighbors of the object and view them as candidates of KNNs. for (size_t j = 1; j <= size; j++) { int candidate_id = *(data + j); if (!(visited_array[candidate_id] == visited_array_tag)) { cnt_visit ++; visited_array[candidate_id] = visited_array_tag;
// If the KNN set is not full, then calculate the exact distance. (i.e., assume the distance threshold to be infinity) if (answers.size() < k){ char *currObj1 = (getDataByInternalId(candidate_id)); dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_); adsampling::tot_full_dist ++; if (!has_deletions || !isMarkedDeleted(candidate_id)){ candidate_set.emplace(-dist, candidate_id); top_candidates.emplace(dist, candidate_id); answers.emplace(dist, candidate_id); } if (!answers.empty()) lowerBound = answers.top().first; if (!top_candidates.empty()) lowerBoundcan = top_candidates.top().first; } // Otherwise, conduct DCO with ADSampling wrt the Kth NN. else { char *currObj1 = (getDataByInternalId(candidate_id)); dist_t dist = adsampling::dist_comp(lowerBound, currObj1, data_point, 0, 0); // If it's a positive object, then include it in R1, R2, S. if(dist >= 0){ candidate_set.emplace(-dist, candidate_id); if(!has_deletions || !isMarkedDeleted(candidate_id)){ top_candidates.emplace(dist, candidate_id); answers.emplace(dist, candidate_id); } if(top_candidates.size() > ef) top_candidates.pop(); if(answers.size() > k) answers.pop();
if (!answers.empty()) lowerBound = answers.top().first; if (!top_candidates.empty()) lowerBoundcan = top_candidates.top().first; } // If it's a negative object, then update R2, S with the approximate distance. else{ if(top_candidates.size() < ef || lowerBoundcan > -dist){ top_candidates.emplace(-dist, candidate_id); candidate_set.emplace(dist, candidate_id); } if(top_candidates.size() > ef){ top_candidates.pop(); } if (!top_candidates.empty()) lowerBoundcan = top_candidates.top().first; } } } } } adsampling::tot_dist_calculation += cnt_visit; visited_list_pool_->releaseVisitedList(vl); return answers; }