24 using namespace TableFunctions_Namespace;
// Vector overload of write_cos_sim: emits one output row per entry of
// `key_map`, pairing the uncompressed primary key with the similarity score
// stored at the same index.
//
// similarity_vector     scores, read at the same index as `key_map`.
// key_map               compressed key ids, one per output row.
// primary_key_metadata  used to map each compressed id back to its
//                       uncompressed key value.
//
// NOTE(review): `similarity_vector` is indexed with the `key_map` loop index,
// so it is assumed to hold at least key_map.size() entries -- confirm at the
// call sites.
26 template <
typename U,
typename K,
typename S>
27 int64_t write_cos_sim(
const std::vector<S>& similarity_vector,
28 const std::vector<U>& key_map,
29 const ColumnMetadata& primary_key_metadata,
// The number of output rows equals the number of mapped keys.
32 const uint64_t num_rows = key_map.size();
35 for (U c = 0; c != key_map.size(); ++c) {
// Translate the compressed id back to the original key before writing it.
36 output_key_col[c] = primary_key_metadata.map_to_uncompressed_range(key_map[c]);
37 output_similarity[c] = similarity_vector[c];
// Matrix overload of write_cos_sim: emits one output row for every (r, c)
// pair with r <= c of the similarity matrix, i.e. one triangle including the
// diagonal. Each row carries the uncompressed key for r, the uncompressed key
// for c, and similarity_matrix.get(r, c).
//
// similarity_matrix     pairwise similarity scores; only entries with
//                       r <= c are read here.
// key_map               compressed key ids, indexed by matrix row/column.
// primary_key_metadata  used to map compressed ids back to uncompressed keys.
//
// NOTE(review): `key_map` is indexed by both r and c, so it is assumed to
// have at least similarity_matrix.num_cols entries -- confirm.
42 template <
typename U,
typename K,
typename S>
43 int64_t write_cos_sim(
const DenseMatrix<S>& similarity_matrix,
44 const std::vector<U>& key_map,
45 const ColumnMetadata& primary_key_metadata,
// n * (n + 1) / 2 is the number of (r, c) pairs with r <= c for n columns.
49 const uint64_t num_rows =
50 similarity_matrix.num_cols * (similarity_matrix.num_cols + 1) / 2;
// Position in the output columns of the pair currently being written.
54 uint64_t output_idx = 0;
55 for (U c = 0; c != similarity_matrix.num_cols; ++c) {
// The column key depends only on c, so it is resolved once per outer pass.
56 const U uncompressed_col_key =
57 primary_key_metadata.map_to_uncompressed_range(key_map[c]);
// Rows 0..c inclusive: restricts the walk to pairs with r <= c.
58 const U max_row = c + 1;
59 for (U r = 0; r != max_row; ++r) {
60 const U uncompressed_row_key =
61 primary_key_metadata.map_to_uncompressed_range(key_map[r]);
62 output_key_col_1[output_idx] = uncompressed_row_key;
63 output_key_col_2[output_idx] = uncompressed_col_key;
64 output_similarity[output_idx] = similarity_matrix.get(r, c);
// Computes the similarity of every primary key in a pivoted matrix against a
// single comparison vector and writes the result through write_cos_sim.
//
// Steps visible in the body:
//   1. Union the matrix and vector pivot-feature metadata so both sides are
//      built against the same feature key space.
//   2. Build a sparse CSC matrix from the matrix inputs and a sparse vector
//      from the vector inputs, both against the unioned metadata.
//   3. When `normalize_by_idf` is set, scale the vector entries by the IDF
//      weights returned from idf_normalize.
//   4. Multiply matrix by vector and hand the scores to write_cos_sim.
//
// Template parameters: K/F/M are the element types forwarded to the pivot
// helpers; U is the index type and S the score type of the sparse structures.
71 template <
typename K,
typename F,
typename M,
typename U,
typename S>
72 int64_t similarity_vector_impl(
const Column<K>& matrix_primary_key,
75 const CompositeKeyMetadata& matrix_primary_key_metadata,
76 const CompositeKeyMetadata& matrix_pivot_features_metadata,
79 const CompositeKeyMetadata& vector_pivot_features_metadata,
82 const bool normalize_by_idf) {
83 CompositeKeyMetadata unioned_pivot_features_metadata = unionCompositeKeyMetadata(
84 matrix_pivot_features_metadata, vector_pivot_features_metadata);
// NOTE(review): presumably copies the null-handling info of the second
// argument into the unioned metadata so the matrix pivot below treats nulls
// the way the matrix input does -- confirm against the helper. The same call
// is repeated with the vector metadata before the vector pivot, so the order
// of these statements matters.
88 copyCompositeKeyMetadataNulls(unioned_pivot_features_metadata,
89 matrix_pivot_features_metadata);
91 SparseMatrixCsc<U, S> sparse_matrix_csc =
92 pivot_table_to_sparse_csc_matrix<K, F, M, U, S>(matrix_primary_key,
93 matrix_pivot_features,
95 matrix_primary_key_metadata,
96 unioned_pivot_features_metadata);
97 copyCompositeKeyMetadataNulls(unioned_pivot_features_metadata,
98 vector_pivot_features_metadata);
100 SparseVector<U, S> sparse_vector = pivot_table_to_sparse_vector<F, M, U, S>(
101 vector_pivot_features, vector_metric, unioned_pivot_features_metadata);
103 if (normalize_by_idf) {
// NOTE(review): idf_normalize returns per-feature weights; it presumably also
// rescales sparse_matrix_csc in place (its return value is the only thing
// applied to the vector here) -- confirm.
104 const std::vector<double> idf_vec = idf_normalize(
105 sparse_matrix_csc, static_cast<U>(unioned_pivot_features_metadata.num_keys));
106 const size_t sparse_vec_size = sparse_vector.data.size();
// Apply to each stored vector entry the weight of the feature it belongs to.
107 for (
size_t r = 0; r < sparse_vec_size; ++r) {
108 sparse_vector.data[r] *= idf_vec[sparse_vector.row_indices[r]];
// NOTE(review): the trailing `true` presumably requests unit-normalized
// (cosine) output -- confirm against multiply_matrix_by_vector.
112 const std::vector<S> similarity_vector =
113 multiply_matrix_by_vector(sparse_matrix_csc, sparse_vector,
true);
// The matrix's col_values serve as the key map, and only the first key's
// column metadata is used to uncompress the primary keys.
115 const int64_t num_rows =
116 write_cos_sim(similarity_vector,
117 sparse_matrix_csc.col_values,
118 matrix_primary_key_metadata.keys_metadata[0].column_metadata,
// Computes pairwise self-similarity between all primary keys of a pivoted
// table: builds a sparse CSC matrix, optionally IDF-normalizes it, multiplies
// it by its transpose, and writes the resulting dense matrix through the
// DenseMatrix overload of write_cos_sim.
//
// Template parameters: K/F/M are the element types forwarded to the pivot
// helper; U is the index type and S the score type of the sparse matrix.
125 template <
typename K,
typename F,
typename M,
typename U,
typename S>
126 int64_t similarity_impl(
const Column<K>& primary_key,
129 const CompositeKeyMetadata& primary_key_metadata,
130 const CompositeKeyMetadata& pivot_features_metadata,
134 const bool normalize_by_idf) {
135 SparseMatrixCsc<U, S> sparse_matrix_csc =
136 pivot_table_to_sparse_csc_matrix<K, F, M, U, S>(primary_key,
139 primary_key_metadata,
140 pivot_features_metadata);
142 if (normalize_by_idf) {
// The return value is discarded here. NOTE(review): this only has an effect
// if idf_normalize mutates sparse_matrix_csc in place -- confirm.
143 idf_normalize(sparse_matrix_csc, static_cast<U>(pivot_features_metadata.num_keys));
// NOTE(review): the trailing `true` presumably requests unit-normalized
// (cosine) output -- confirm against multiply_matrix_by_transpose.
146 const DenseMatrix<S> similarity_matrix =
147 multiply_matrix_by_transpose(sparse_matrix_csc,
true);
// The matrix's col_values serve as the key map, and only the first key's
// column metadata is used to uncompress the primary keys.
149 const int64_t num_rows =
150 write_cos_sim(similarity_matrix,
151 sparse_matrix_csc.col_values,
152 primary_key_metadata.keys_metadata[0].column_metadata,
153 output_primary_key_1,
154 output_primary_key_2,
// Entry point for similarity of each primary key against a comparison
// feature set, for feature columns of type F.
//
// Validates that both feature column lists have the same number of key
// columns, derives composite-key metadata for every input, and dispatches to
// similarity_vector_impl with an index type wide enough for the key ranges
// (uint64_t when the larger range exceeds uint32_t, uint32_t otherwise).
// Scores are computed as float in both cases.
169 template <
typename K,
typename F,
typename M>
170 int64_t tf_feature_similarity__cpu_template(
176 const bool use_tf_idf,
// Both feature lists must have the same number of key columns.
179 if (pivot_features.
numCols() != comparison_pivot_features.
numCols()) {
// NOTE(review): the error is reported on stdout via std::cout rather than
// through an error channel -- confirm this is the intended behavior.
180 std::cout <<
"Error: Pivot features must have the same number of keys." << std::endl;
185 const auto primary_key_metadata = getCompositeKeyMetadata(primary_key);
186 const auto pivot_features_metadata = getCompositeKeyMetadata(pivot_features);
187 const auto comparison_pivot_features_metadata =
188 getCompositeKeyMetadata(comparison_pivot_features);
// The larger of the two key counts decides the index width below.
// NOTE(review): comparison_pivot_features_metadata.num_keys is not part of
// this max, although similarity_vector_impl indexes by the union of both
// feature key sets -- confirm the union cannot exceed the uint32_t range when
// the narrow path is chosen.
191 const uint64_t max_dimension_range =
192 std::max(primary_key_metadata.num_keys, pivot_features_metadata.num_keys);
// Wide (64-bit index) path.
194 if (max_dimension_range > std::numeric_limits<uint32_t>::max()) {
195 return similarity_vector_impl<K, F, M, uint64_t, float>(
199 primary_key_metadata,
200 pivot_features_metadata,
201 comparison_pivot_features,
203 comparison_pivot_features_metadata,
// Narrow (32-bit index) path.
209 return similarity_vector_impl<K, F, M, uint32_t, float>(
213 primary_key_metadata,
214 pivot_features_metadata,
215 comparison_pivot_features,
217 comparison_pivot_features_metadata,
// Entry point for similarity against a comparison feature set when the
// feature columns are dictionary-encoded text (TextEncodingDict).
//
// The comparison columns may use different string dictionaries than the
// primary feature columns. For every feature column whose dictionary key
// differs, the comparison ids are translated into the primary dictionary's id
// space; ids without a counterpart become the int32 inline null value. The
// translated columns are then processed like the generic overload: metadata
// is derived and similarity_vector_impl is invoked with a uint64_t or
// uint32_t index type depending on the key ranges.
233 template <
typename K,
typename M>
234 int64_t tf_feature_similarity__cpu_template(
240 const bool use_tf_idf,
// Both feature lists must have the same number of key columns.
243 if (pivot_features.
numCols() != comparison_pivot_features.
numCols()) {
// NOTE(review): the error is reported on stdout via std::cout rather than
// through an error channel -- confirm this is the intended behavior.
244 std::cout <<
"Error: Pivot features must have the same number of keys." << std::endl;
249 const int64_t num_feature_cols = pivot_features.
numCols();
250 const int64_t num_comparison_rows = comparison_pivot_features.
size();
// Per-column data pointers and dictionary proxies for the (possibly
// translated) comparison columns.
251 std::vector<int8_t*> new_col_ptrs;
252 std::vector<StringDictionaryProxy*> new_sdp_ptrs;
// Backing storage for translated ids. The outer vector is sized up front, so
// the inner buffers whose data() pointers are stored in new_col_ptrs are not
// moved by later iterations. NOTE(review): this storage must stay alive for
// as long as the column pointers are used -- confirm.
253 std::vector<std::vector<int32_t>> translated_col_ids(num_feature_cols);
254 for (int64_t col_idx = 0; col_idx < num_feature_cols; ++col_idx) {
256 const auto& primary_sdp_string_dict_id = primary_sdp->
getDictKey();
258 const auto& comparison_string_dict_id = comparison_sdp->
getDictKey();
// Translation is only needed when the two columns use different dictionaries.
259 if (primary_sdp_string_dict_id != comparison_string_dict_id) {
260 const auto translation_map =
261 comparison_sdp->buildIntersectionTranslationMapToOtherProxy(primary_sdp, {});
262 translated_col_ids[col_idx].resize(num_comparison_rows);
263 int32_t* translated_ids = translated_col_ids[col_idx].data();
// The raw column bytes are read as int32 dictionary ids.
264 const auto source_col_ptr =
265 reinterpret_cast<const int32_t*
>(comparison_pivot_features.
ptrs_[col_idx]);
266 for (int64_t row_idx = 0; row_idx < num_comparison_rows; ++row_idx) {
267 const auto source_id = source_col_ptr[row_idx];
// Null source ids skip the lookup and take the -1 sentinel directly.
268 const auto translated_id =
269 source_id != inline_null_value<int32_t>() ? translation_map[source_id] : -1;
// A -1 result (null source, or a -1 returned by the map) is stored as null.
270 translated_ids[row_idx] =
271 translated_id == -1 ? inline_null_value<int32_t>() : translated_id;
// The translated column now refers to ids of the primary dictionary.
273 new_col_ptrs.emplace_back(reinterpret_cast<int8_t*>(translated_ids));
274 new_sdp_ptrs.emplace_back(primary_sdp);
// NOTE(review): presumably the matching-dictionary branch -- the original
// column data and its own proxy are passed through unchanged.
276 new_col_ptrs.emplace_back(comparison_pivot_features.
ptrs_[col_idx]);
277 new_sdp_ptrs.emplace_back(comparison_sdp);
281 new_col_ptrs.data(), num_feature_cols, num_comparison_rows, new_sdp_ptrs.data());
283 const auto primary_key_metadata = getCompositeKeyMetadata(primary_key);
284 const auto pivot_features_metadata = getCompositeKeyMetadata(pivot_features);
// Metadata for the comparison side is derived from the translated columns.
285 const auto comparison_pivot_features_metadata =
286 getCompositeKeyMetadata(translated_comparison_pivot_features);
// The larger of the two key counts decides the index width below.
// NOTE(review): comparison_pivot_features_metadata.num_keys is not part of
// this max, although similarity_vector_impl indexes by the union of both
// feature key sets -- confirm the union cannot exceed the uint32_t range when
// the narrow path is chosen.
289 const uint64_t max_dimension_range =
290 std::max(primary_key_metadata.num_keys, pivot_features_metadata.num_keys);
// Wide (64-bit index) path.
292 if (max_dimension_range > std::numeric_limits<uint32_t>::max()) {
293 return similarity_vector_impl<K, TextEncodingDict, M, uint64_t, float>(
297 primary_key_metadata,
298 pivot_features_metadata,
299 translated_comparison_pivot_features,
301 comparison_pivot_features_metadata,
// Narrow (32-bit index) path.
307 return similarity_vector_impl<K, TextEncodingDict, M, uint32_t, float>(
311 primary_key_metadata,
312 pivot_features_metadata,
313 translated_comparison_pivot_features,
315 comparison_pivot_features_metadata,
// Entry point for pairwise self-similarity between all primary keys.
//
// Derives composite-key metadata for the primary key and pivot features, then
// dispatches to similarity_impl with an index type wide enough for the larger
// of the two key ranges (uint64_t when it exceeds uint32_t, uint32_t
// otherwise). Scores are computed as float in both cases.
331 template <
typename K,
typename F,
typename M>
332 int64_t tf_feature_self_similarity__cpu_template(
const Column<K>& primary_key,
335 const bool use_tf_idf,
339 const auto primary_key_metadata = getCompositeKeyMetadata(primary_key);
340 const auto pivot_features_metadata = getCompositeKeyMetadata(pivot_features);
// The larger key count decides whether 32-bit indices are sufficient.
342 const uint64_t max_dimension_range =
343 std::max(primary_key_metadata.num_keys, pivot_features_metadata.num_keys);
// Wide (64-bit index) path.
344 if (max_dimension_range > std::numeric_limits<uint32_t>::max()) {
345 return similarity_impl<K, F, M, uint64_t, float>(primary_key,
348 primary_key_metadata,
349 pivot_features_metadata,
350 output_primary_key_1,
351 output_primary_key_2,
// Narrow (32-bit index) path.
356 return similarity_impl<K, F, M, uint32_t, float>(primary_key,
359 primary_key_metadata,
360 pivot_features_metadata,
361 output_primary_key_1,
362 output_primary_key_2,
368 #endif // #ifdef HAVE_TBB
369 #endif // #ifndef __CUDACC__
EXTENSION_NOINLINE_HOST void set_output_row_size(int64_t num_rows)
DEVICE int64_t numCols() const
DEVICE int64_t size() const
StringDictionaryProxy ** string_dict_proxies_
DEVICE int64_t numCols() const
const shared::StringDictKey & getDictKey() const noexcept