meg-huggingface committed
Commit f9936fb · 1 Parent(s): deefca3

Update from rollback

data_measurements/dataset_statistics.py CHANGED
@@ -303,6 +303,7 @@ class DatasetStatisticsCacheClass:
         self.node_list_fid = pjoin(self.cache_path, "node_list.th")
         # Needed for UI
         self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
+        self.zipf_counts = None
 
         self.live = False
 
@@ -366,6 +367,7 @@ class DatasetStatisticsCacheClass:
         """
         # Text length figure
         if (self.use_cache and exists(self.fig_tok_length_fid)):
+            self.fig_tok_length_png = mpimg.imread(self.fig_tok_length_fid)
             self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
         else:
             if not self.live:
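The added line reads the cached token-length figure back in as an image array via matplotlib. A minimal standalone sketch of that call, assuming the cached figure is a PNG on disk (the path below is hypothetical):

# Read a cached figure image into a NumPy array (hypothetical path).
import matplotlib.image as mpimg

fig_tok_length_png = mpimg.imread("cache_dir/fig_tok_length.png")
print(fig_tok_length_png.shape)  # e.g. (height, width, 4) for an RGBA PNG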
@@ -709,6 +711,8 @@ class DatasetStatisticsCacheClass:
                 zipf_dict = json.load(f)
             self.z = Zipf()
             self.z.load(zipf_dict)
+            # TODO: Should this be cached?
+            self.zipf_counts = self.z.calc_zipf_counts(self.vocab_counts_df)
             self.zipf_fig = read_plotly(self.zipf_fig_fid)
         elif self.use_cache and exists(self.zipf_fid):
             # TODO: Read zipf data so that the vocab is there.
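Here `calc_zipf_counts` fills in the new `self.zipf_counts` attribute with the term counts expected under the fitted Zipf distribution. As background, a rough standalone sketch of that idea, assuming a fitted exponent `alpha` and a `count` column; this is not the repository's implementation:

# Illustrative only: expected counts under Zipf's law for a ranked vocabulary.
import numpy as np
import pandas as pd

def zipf_expected_counts(vocab_counts_df: pd.DataFrame, alpha: float, count_col: str = "count") -> np.ndarray:
    counts = vocab_counts_df[count_col].sort_values(ascending=False).to_numpy()
    ranks = np.arange(1, len(counts) + 1)
    weights = 1.0 / ranks ** alpha                 # frequency proportional to 1 / rank^alpha
    return counts.sum() * weights / weights.sum()  # scale to the observed total count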
@@ -771,26 +775,30 @@ class nPMIStatisticsCacheClass:
             and exists(self.npmi_terms_fid)
             and json.load(open(self.npmi_terms_fid))["available terms"] != []
         ):
-            available_terms = json.load(open(self.npmi_terms_fid))["available terms"]
+            self.available_terms = json.load(open(self.npmi_terms_fid))["available terms"]
         else:
-            true_false = [
-                term in self.dstats.vocab_counts_df.index for term in self.termlist
-            ]
-            word_list_tmp = [x for x, y in zip(self.termlist, true_false) if y]
-            true_false_counts = [
-                self.dstats.vocab_counts_df.loc[word, CNT] >= self.min_vocab_count
-                for word in word_list_tmp
-            ]
-            available_terms = [
-                word for word, y in zip(word_list_tmp, true_false_counts) if y
-            ]
-            logs.info(available_terms)
-            with open(self.npmi_terms_fid, "w+") as f:
-                json.dump({"available terms": available_terms}, f)
-            self.available_terms = available_terms
-        return available_terms
-
-    def load_or_prepare_joint_npmi(self, subgroup_pair):
+            if not self.live:
+                if self.dstats.vocab_counts_df is None:
+                    self.dstats.load_or_prepare_vocab()
+
+                true_false = [
+                    term in self.dstats.vocab_counts_df.index for term in self.termlist
+                ]
+                word_list_tmp = [x for x, y in zip(self.termlist, true_false) if y]
+                true_false_counts = [
+                    self.dstats.vocab_counts_df.loc[word, CNT] >= self.min_vocab_count
+                    for word in word_list_tmp
+                ]
+                available_terms = [
+                    word for word, y in zip(word_list_tmp, true_false_counts) if y
+                ]
+                logs.info(available_terms)
+                with open(self.npmi_terms_fid, "w+") as f:
+                    json.dump({"available terms": available_terms}, f)
+                self.available_terms = available_terms
+        return self.available_terms
+
+    def load_or_prepare_joint_npmi(self, subgroup_pair, save=True):
         """
         Run on-the fly, while the app is already open,
         as it depends on the subgroup terms that the user chooses
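The rewritten else-branch now only runs when the app is not live, lazily builds the vocabulary if needed, and keeps just the identity terms that appear in the vocabulary with at least `min_vocab_count` occurrences. The same filtering as a toy standalone sketch (the terms, counts, and column name below are made up):

# Toy example of the term-filtering logic above.
import pandas as pd

CNT = "count"  # stands in for the module-level column-name constant
vocab_counts_df = pd.DataFrame({CNT: [120, 3, 57]}, index=["woman", "nonbinary", "man"])
termlist = ["woman", "man", "astronaut"]
min_vocab_count = 20

in_vocab = [term for term in termlist if term in vocab_counts_df.index]
available_terms = [
    term for term in in_vocab if vocab_counts_df.loc[term, CNT] >= min_vocab_count
]
print(available_terms)  # ['woman', 'man']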
@@ -824,12 +832,14 @@ class nPMIStatisticsCacheClass:
             joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
                 subgroup_pair, subgroup_files
             )
-            # Cache new results
-            logs.info("Writing out.")
-            for subgroup in subgroup_pair:
-                write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
-            with open(joint_npmi_fid, "w+") as f:
-                joint_npmi_df.to_csv(f)
+            if save:
+                if joint_npmi_df is not None:
+                    # Cache new results
+                    logs.info("Writing out.")
+                    for subgroup in subgroup_pair:
+                        write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
+                    with open(joint_npmi_fid, "w+") as f:
+                        joint_npmi_df.to_csv(f)
         else:
             joint_npmi_df = pd.DataFrame()
             logs.info("The joint npmi df is")
@@ -871,7 +881,7 @@ class nPMIStatisticsCacheClass:
                 subgroup_dict[subgroup] = cached_results
         logs.info("Calculating for subgroup list")
         joint_npmi_df, subgroup_dict = self.do_npmi(subgroup_pair, subgroup_dict)
-        return joint_npmi_df.dropna(), subgroup_dict
+        return joint_npmi_df, subgroup_dict
 
     # TODO: Update pairwise assumption
     def do_npmi(self, subgroup_pair, subgroup_dict):
@@ -882,6 +892,7 @@ class nPMIStatisticsCacheClass:
         :return: Selected identity term's co-occurrence counts with
                  other words, pmi per word, and nPMI per word.
         """
+        no_results = False
         logs.info("Initializing npmi class")
         npmi_obj = self.set_npmi_obj()
         # Canonical ordering used
@@ -889,18 +900,26 @@ class nPMIStatisticsCacheClass:
         # Calculating nPMI statistics
         for subgroup in subgroup_pair:
            # If the subgroup data is already computed, grab it.
-            # TODO: Should we set idx and column names similarly to how we set them for cached files?
+            # TODO: Should we set idx and column names similarly to
+            # how we set them for cached files?
             if subgroup not in subgroup_dict:
                 logs.info("Calculating statistics for %s" % subgroup)
                 vocab_cooc_df, pmi_df, npmi_df = npmi_obj.calc_metrics(subgroup)
-                # Store the nPMI information for the current subgroups
-                subgroup_dict[subgroup] = (vocab_cooc_df, pmi_df, npmi_df)
-        # Pair the subgroups together, indexed by all words that
-        # co-occur between them.
-        logs.info("Computing pairwise npmi bias")
-        paired_results = npmi_obj.calc_paired_metrics(subgroup_pair, subgroup_dict)
-        UI_results = make_npmi_fig(paired_results, subgroup_pair)
-        return UI_results, subgroup_dict
+                if vocab_cooc_df is None:
+                    no_results = True
+                else:
+                    # Store the nPMI information for the current subgroups
+                    subgroup_dict[subgroup] = (vocab_cooc_df, pmi_df, npmi_df)
+        if no_results:
+            logs.warning("Couldn't grap the npmi files -- Under construction")
+            return None, None
+        else:
+            # Pair the subgroups together, indexed by all words that
+            # co-occur between them.
+            logs.info("Computing pairwise npmi bias")
+            paired_results = npmi_obj.calc_paired_metrics(subgroup_pair, subgroup_dict)
+            UI_results = make_npmi_fig(paired_results, subgroup_pair)
+            return UI_results.dropna(), subgroup_dict
 
     def set_npmi_obj(self):
         """
@@ -1291,3 +1310,4 @@ def write_zipf_data(z, zipf_fid):
     zipf_dict["uniq_ranks"] = [int(rank) for rank in z.uniq_ranks]
     with open(zipf_fid, "w+", encoding="utf-8") as f:
         json.dump(zipf_dict, f)
+