utility/plot.py line 64 bug
MilesQLi opened this issue · 5 comments
I see some commented-out code there, and this line doesn't work.
Thanks for raising the issue. Can you provide a bit more detail on what doesn't work? Which script/command are you running? Does it raise an error, and if so, which one? Or is the output just unexpected?
Thanks!
It happens when I run the fine-tuning code from README.md. The line that triggers the error is
_evaluate_model(model_w_sim, model_path, [test_path], test_stel=False, test_AV=True)
with test_AV set to True.
The error log is as follows:
ValueError Traceback (most recent call last)
Input In [1], in <cell line: 23>()
20 model_path = tuner.train(epochs=1, batch_size=128)
22 model_w_sim = TunedSentenceBertSimilarity(model=tuner.model)
---> 23 _evaluate_model(model_w_sim, model_path, [test_path], test_stel=False, test_AV=True)
File ~\repos\Style-Embeddings-master\src\style_embed\eval_model.py:61, in _evaluate_model(model, model_path, test_files, test_stel, test_AV)
59 for test_file in test_files:
60 logging.info("testing on {} ...".format(test_file))
---> 61 triple_test_sim_function(similarity_function_callable=model.similarities,
62 triple_task_filename=test_file, output_folder=results_folder,
63 sim_function_name=model_name)
File Style-Embeddings\src\style_embed..\style_embed/utility\evaluation_metrics.py:176, in triple_test_sim_function(similarity_function_callable, triple_task_filename, print_top_n, output_folder, sim_function_name, model_prefix)
172 plot_sims_filebase = f"{output_folder}/02_sims_ACC-{result_dict['acc_score']}{time.time()}"
174 logging.info('Going to save plots to {} and{}'.format(plot_diff_filebase, plot_sims_filebase))
--> 176 diff_values = plot_from_resultdict(plot_diff_filebase, plot_sims_filebase, result_dict)
178 # SAVING single predictions
179 pred_save_path = f"{output_folder}/03_T-AV_si-p{time.time()}.tsv" #{file_base}_
File Style-Embeddings\src\style_embed..\style_embed/utility\evaluation_metrics.py:213, in plot_from_resultdict(plot_diff_filebase, plot_sims_filebase, result_dict)
210 # plot_diff_values(diff_values, result_dict["triple_pred"], result_dict["val_class"], plot_diff_filebase)
211 # PLOT similarity values, i.e., same author values should be greater than distinct
212 logging.info('plotting sim values ...')
--> 213 plot_sim_values(result_dict["same_sims"], result_dict["distinct_sims"], plot_sims_filebase)
214 return diff_values
File Style-Embeddings\src\style_embed..\style_embed/utility\plot.py:64, in plot_sim_values(gtp_sim_val, gtn_sim_val, plot_filebase, size, median)
61 dfd = pd.DataFrame({'Distinct Author': np.ones(len(gtn_sim_val)),
62 'Similarity Value': tensorarray_to_array(gtn_sim_val)})
63 df = pd.concat([dfs, dfd]) # , ignore_index=True) # ignore_index=True
---> 64 graph = sns.displot(df, x="Similarity Value", hue="Distinct Author", kind="kde", fill=True) # , cut=0) sns.kdeplot(df, x="Similarity Value", shade=False, color='crimson') #
65 # graph = sns.kdeplot(data=df, x="Similarity Value", shade=False) #
66 means = dfs['Similarity Value'].mean()
File pathto\lib\site-packages\seaborn\distributions.py:2299, in displot(data, x, y, hue, row, col, weights, kind, rug, rug_kws, log_scale, legend, palette, hue_order, hue_norm, color, col_wrap, row_order, col_order, height, aspect, facet_kws, **kwargs)
2296 if p.univariate:
2298 _assign_default_kwargs(kde_kws, p.plot_univariate_density, kdeplot)
-> 2299 p.plot_univariate_density(**kde_kws)
2301 else:
2303 _assign_default_kwargs(kde_kws, p.plot_bivariate_density, kdeplot)
File pathto\lib\site-packages\seaborn\distributions.py:928, in _DistributionPlotter.plot_univariate_density(self, multiple, common_norm, common_grid, warn_singular, fill, legend, estimate_kws, **plot_kws)
925 log_scale = self._log_scaled(self.data_variable)
927 # Do the computation
--> 928 densities = self._compute_univariate_density(
929 self.data_variable,
930 common_norm,
931 common_grid,
932 estimate_kws,
933 log_scale,
934 warn_singular,
935 )
937 # Adjust densities based on the multiple rule
938 densities, baselines = self._resolve_multiple(densities, multiple)
File pathto\lib\site-packages\seaborn\distributions.py:303, in _DistributionPlotter._compute_univariate_density(self, data_variable, common_norm, common_grid, estimate_kws, log_scale, warn_singular)
299 common_norm = False
301 densities = {}
--> 303 for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):
304
305 # Extract the data points from this sub set and remove nulls
306 sub_data = sub_data.dropna()
307 observations = sub_data[data_variable]
File pathto\lib\site-packages\seaborn\_core.py:983, in VectorPlotter.iter_data(self, grouping_vars, reverse, from_comp_data)
978 grouping_vars = [
979 var for var in grouping_vars if var in self.variables
980 ]
982 if from_comp_data:
--> 983 data = self.comp_data
984 else:
985 data = self.plot_data
File pathto\lib\site-packages\seaborn\_core.py:1057, in VectorPlotter.comp_data(self)
1055 orig = self.plot_data[var].dropna()
1056 comp_col = pd.Series(index=orig.index, dtype=float, name=var)
-> 1057 comp_col.loc[orig.index] = pd.to_numeric(axis.convert_units(orig))
1059 if axis.get_scale() == "log":
1060 comp_col = np.log10(comp_col)
File pathto\lib\site-packages\pandas\core\indexing.py:716, in _LocationIndexer.__setitem__(self, key, value)
713 self._has_valid_setitem_indexer(key)
715 iloc = self if self.name == "iloc" else self.obj.iloc
--> 716 iloc._setitem_with_indexer(indexer, value, self.name)
File pathto\lib\site-packages\pandas\core\indexing.py:1690, in _iLocIndexer._setitem_with_indexer(self, indexer, value, name)
1688 self._setitem_with_indexer_split_path(indexer, value, name)
1689 else:
-> 1690 self._setitem_single_block(indexer, value, name)
File pathto\lib\site-packages\pandas\core\indexing.py:1929, in _iLocIndexer._setitem_single_block(self, indexer, value, name)
1923 indexer = maybe_convert_ix(*indexer) # e.g. test_setitem_frame_align
1925 if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict):
1926 # TODO(EA): ExtensionBlock.setitem this causes issues with
1927 # setting for extensionarrays that store dicts. Need to decide
1928 # if it's worth supporting that.
-> 1929 value = self._align_series(indexer, Series(value))
1931 elif isinstance(value, ABCDataFrame) and name != "iloc":
1932 value = self._align_frame(indexer, value)
File pathto\lib\site-packages\pandas\core\indexing.py:2091, in _iLocIndexer._align_series(self, indexer, ser, multiindex_indexer)
2089 if obj.ndim == 2 and is_empty_indexer(indexer[0], ser._values):
2090 return ser._values.copy()
-> 2091 ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values
2093 # single indexer
2094 if len(indexer) > 1 and not multiindex_indexer:
File pathto\lib\site-packages\pandas\core\series.py:4672, in Series.reindex(self, *args, **kwargs)
4668 raise TypeError(
4669 "'index' passed as both positional and keyword argument"
4670 )
4671 kwargs.update({"index": index})
-> 4672 return super().reindex(**kwargs)
File pathto\lib\site-packages\pandas\core\generic.py:4966, in NDFrame.reindex(self, *args, **kwargs)
4963 return self._reindex_multi(axes, copy, fill_value)
4965 # perform the reindex on the axes
-> 4966 return self._reindex_axes(
4967 axes, level, limit, tolerance, method, fill_value, copy
4968 ).__finalize__(self, method="reindex")
File pathto\lib\site-packages\pandas\core\generic.py:4986, in NDFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4981 new_index, indexer = ax.reindex(
4982 labels, level=level, limit=limit, tolerance=tolerance, method=method
4983 )
4985 axis = self._get_axis_number(a)
-> 4986 obj = obj._reindex_with_indexers(
4987 {axis: [new_index, indexer]},
4988 fill_value=fill_value,
4989 copy=copy,
4990 allow_dups=False,
4991 )
4992 # If we've made a copy once, no need to make another one
4993 copy = False
File pathto\lib\site-packages\pandas\core\generic.py:5032, in NDFrame._reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
5029 indexer = ensure_platform_int(indexer)
5031 # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
-> 5032 new_data = new_data.reindex_indexer(
5033 index,
5034 indexer,
5035 axis=baxis,
5036 fill_value=fill_value,
5037 allow_dups=allow_dups,
5038 copy=copy,
5039 )
5040 # If we've made a copy once, no need to make another one
5041 copy = False
File pathto\lib\site-packages\pandas\core\internals\managers.py:679, in BaseBlockManager.reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy, consolidate, only_slice, use_na_proxy)
677 # some axes don't allow reindexing with dups
678 if not allow_dups:
--> 679 self.axes[axis]._validate_can_reindex(indexer)
681 if axis >= self.ndim:
682 raise IndexError("Requested axis not found in manager")
File pathto\lib\site-packages\pandas\core\indexes\base.py:4107, in Index._validate_can_reindex(self, indexer)
4105 # trying to reindex on an axis with duplicates
4106 if not self._index_as_unique and len(indexer):
-> 4107 raise ValueError("cannot reindex on an axis with duplicate labels")
ValueError: cannot reindex on an axis with duplicate labels
Thanks! In my first try I couldn't reproduce the error, but I will try a few more things in two weeks (vacation time next week).
Are you evaluating on your own dataset or on our provided test dataset? The seaborn version used here is 0.11.0.
As a short-term workaround you could remove the plotting (you will also need to comment out a few other lines of code later on, but those shouldn't be the most interesting eval results anyway).
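For concreteness, a minimal sketch of that workaround, based on the frames in the traceback above (the exact surrounding lines may differ in your checkout):

# in src/style_embed/utility/evaluation_metrics.py, inside triple_test_sim_function:
# comment out the call that produces the plots ...
# diff_values = plot_from_resultdict(plot_diff_filebase, plot_sims_filebase, result_dict)
# ... and any later lines that still reference diff_values or the plot file paths.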
My seaborn version is 0.11.2. I think the cause is that with
df = pd.concat([dfs, dfd]) # , ignore_index=True)
you combine the similarity values of the positive and the negative samples. When I inspected the resulting pandas DataFrame, the index of the samples from "dfd" starts at 0 again. For example, if "dfs" and "dfd" each have 3 samples, the index of the combined DataFrame is 0,1,2,0,1,2. If we add back ignore_index=True, the index is 0,1,2,3,4,5 and it works. I think this would solve the problem, but I don't know why it works for you. Do you remember why you commented out ignore_index=True?
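A minimal self-contained sketch of what I mean (the similarity values are made up; only the duplicate index matters):

import numpy as np
import pandas as pd
import seaborn as sns

same_sims = [0.9, 0.8, 0.7]      # hypothetical same-author similarities
distinct_sims = [0.2, 0.3, 0.4]  # hypothetical distinct-author similarities

dfs = pd.DataFrame({'Distinct Author': np.zeros(len(same_sims)),
                    'Similarity Value': same_sims})
dfd = pd.DataFrame({'Distinct Author': np.ones(len(distinct_sims)),
                    'Similarity Value': distinct_sims})

df = pd.concat([dfs, dfd])  # index is 0,1,2,0,1,2 -- duplicate labels
# With recent pandas versions this raises inside seaborn:
# sns.displot(df, x="Similarity Value", hue="Distinct Author", kind="kde", fill=True)
# ValueError: cannot reindex on an axis with duplicate labels

df = pd.concat([dfs, dfd], ignore_index=True)  # index is 0,1,2,3,4,5 -- unique
sns.displot(df, x="Similarity Value", hue="Distinct Author", kind="kde", fill=True)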
Interesting question, and good observation. I took a closer look at the code and the Git history in my private repo, and the short answer is that I have no idea why I was even concatenating dataframes instead of just creating the dataframe I want to use directly. I changed the code; it should work now. Reopen if it doesn't, and thanks for pointing me towards this :)
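Roughly what the change looks like (a sketch, not the exact committed code; gtp_sim_val, gtn_sim_val, and tensorarray_to_array are the names from the plot_sim_values frame in the traceback above):

sim_values = np.concatenate([tensorarray_to_array(gtp_sim_val),
                             tensorarray_to_array(gtn_sim_val)])
labels = np.concatenate([np.zeros(len(gtp_sim_val)),
                         np.ones(len(gtn_sim_val))])
# one DataFrame built in a single step, so its index is a unique RangeIndex
# and seaborn's internal reindexing no longer hits duplicate labels
df = pd.DataFrame({'Distinct Author': labels, 'Similarity Value': sim_values})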