saridormi committed on
Commit
a70fbd7
β€’
1 Parent(s): 0646a3e

Fix the first submission for the task case and tidy up code a bit

Browse files
Files changed (1) hide show
  1. src/submission_uploader.py +33 -38
src/submission_uploader.py CHANGED
@@ -3,7 +3,7 @@ import logging
3
  import os
4
  import time
5
  from tempfile import TemporaryDirectory
6
- from typing import Dict, List, Optional
7
 
8
  import jsonlines
9
  from huggingface_hub import CommitOperationAdd # type: ignore[import]
@@ -30,13 +30,13 @@ class SubmissionUploader:
30
  def __init__(self, dataset_id: str, private_dataset_id: str):
31
  self._api = HfApi(token=os.environ["HF_TOKEN"])
32
  self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
33
- self._dataset_id = dataset_id
34
- self._private_dataset_id = private_dataset_id
35
 
36
  def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
37
- """Searches among discussions of dataset repo for a PR with the given title."""
38
  try:
39
- discussions = self._api.get_repo_discussions(repo_id=self._dataset_id, repo_type="dataset")
40
  except Exception:
41
  return None
42
  for discussion in discussions:
@@ -44,22 +44,6 @@ class SubmissionUploader:
44
  return discussion
45
  return None
46
 
47
- def _get_metadata(
48
- self,
49
- model_name_pretty: str,
50
- model_availability: str,
51
- urls: Optional[str],
52
- context_size: str,
53
- submitted_by: str,
54
- ) -> Dict[str, Optional[str]]:
55
- return {
56
- "model_name": model_name_pretty,
57
- "model_availability": model_availability,
58
- "urls": urls,
59
- "context_size": context_size,
60
- "submitted_by": submitted_by,
61
- }
62
-
63
  def _upload_request(
64
  self,
65
  task_id: str,
@@ -74,6 +58,7 @@ class SubmissionUploader:
74
  pr_url: str,
75
  temp_directory: str,
76
  ) -> List[CommitOperationAdd]:
 
77
  request_metadata = {
78
  "model_folder": model_folder,
79
  "model_name_pretty": model_name_pretty,
@@ -90,7 +75,11 @@ class SubmissionUploader:
90
  with open(os.path.join(temp_directory, "request_metadata.json"), "w") as f:
91
  json.dump(request_metadata, f)
92
 
93
- num_requests_already_present = len(self._fs.ls(f"datasets/{self._private_dataset_id}/{task_id}/"))
 
 
 
 
94
  commit_operations = [
95
  CommitOperationAdd(
96
  path_in_repo=f"{task_id}/{num_requests_already_present}_{model_folder}.json",
@@ -105,6 +94,7 @@ class SubmissionUploader:
105
  model_folder: str,
106
  filenames: List[str],
107
  ) -> List[CommitOperationAdd]:
 
108
  commit_operations = [
109
  CommitOperationAdd(
110
  path_in_repo=f"{task_id}/predictions/{model_folder}/{os.path.basename(filename)}",
@@ -115,6 +105,7 @@ class SubmissionUploader:
115
  return commit_operations
116
 
117
  def _compute_metrics_for_predictions(self, task_id: str, filenames: List[str], temp_directory: str) -> None:
 
118
  metrics_module = METRICS[task_id]
119
  assert metrics_module is not None, f"Computing metrics for {task_id} is not supported."
120
  metrics_module.reset()
@@ -153,18 +144,20 @@ class SubmissionUploader:
153
  submitted_by: str,
154
  temp_directory: str,
155
  ) -> List[CommitOperationAdd]:
 
156
  final_results = {}
157
  with open(os.path.join(temp_directory, "final_metrics.json"), "r") as f:
158
  metrics = json.load(f)
159
  final_results.update(metrics)
160
- metadata_dict = self._get_metadata(
161
- model_name_pretty=model_name_pretty,
162
- model_availability=model_availability,
163
- urls=urls,
164
- context_size=context_size,
165
- submitted_by=submitted_by,
 
 
166
  )
167
- final_results.update(metadata_dict)
168
 
169
  with jsonlines.open(os.path.join(temp_directory, "final_results.jsonl"), "w") as writer:
170
  writer.write(final_results)
@@ -189,6 +182,7 @@ class SubmissionUploader:
189
  comment: Optional[str],
190
  filenames: Optional[List[str]],
191
  ):
 
192
  assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
193
  assert model_folder, "Please, specify non-empty name for a directory with a model's results."
194
  assert model_name_pretty, "Please, specify non-empty name for a model."
@@ -238,15 +232,17 @@ class SubmissionUploader:
238
 
239
  logging.info("Checking if this request has already been submitted...")
240
  if not force:
241
- if model_folder in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions"):
242
  return styled_warning(
243
- f"{model_folder} is already present in {self._dataset_id}, please, select another folder name."
244
  )
245
 
246
  prev_pr = self._get_previous_pr(pr_title)
247
  if prev_pr is not None:
248
- url = f"https://huggingface.co/datasets/{self._dataset_id}/discussions/{prev_pr.num}"
249
- return styled_warning(f"{self._dataset_id} already has an open PR for this submission: {url}.")
 
 
250
 
251
  logging.info("Processing predictions...")
252
  predictions_commit_operations = self._upload_predictions(
@@ -271,9 +267,9 @@ class SubmissionUploader:
271
  temp_directory=str(d),
272
  )
273
 
274
- logging.info(f"Creating commit to results dataset...")
275
  new_pr = self._api.create_commit(
276
- repo_id=self._dataset_id,
277
  operations=predictions_commit_operations + results_commit_operations,
278
  commit_message=pr_title,
279
  commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}""",
@@ -281,7 +277,7 @@ class SubmissionUploader:
281
  repo_type="dataset",
282
  )
283
 
284
- logging.info(f"Creating commit to requests dataset...")
285
  request_commit_operations = self._upload_request(
286
  task_id=task_id,
287
  model_folder=model_folder,
@@ -296,7 +292,7 @@ class SubmissionUploader:
296
  pr_url=new_pr.pr_url,
297
  )
298
  self._api.create_commit(
299
- repo_id=self._private_dataset_id,
300
  operations=request_commit_operations,
301
  commit_message=pr_title,
302
  commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}\n* PR: {new_pr.pr_url}\n* Contact information: {contact_information}\n* Comment: {comment}""",
@@ -307,7 +303,6 @@ class SubmissionUploader:
307
  return styled_message(f"πŸŽ‰ PR created at {new_pr.pr_url}.")
308
 
309
  except Exception as e:
310
- logging.exception(e)
311
  exception_msg = str(e)
312
  if exception_msg and os.environ["PRIVATE_DATASET_ID"] in exception_msg:
313
  exception_msg = exception_msg.replace(os.environ["PRIVATE_DATASET_ID"], "{private_dataset}")
 
3
  import os
4
  import time
5
  from tempfile import TemporaryDirectory
6
+ from typing import List, Optional
7
 
8
  import jsonlines
9
  from huggingface_hub import CommitOperationAdd # type: ignore[import]
 
30
  def __init__(self, dataset_id: str, private_dataset_id: str):
31
  self._api = HfApi(token=os.environ["HF_TOKEN"])
32
  self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
33
+ self._results_dataset_id = dataset_id
34
+ self._requests_dataset_id = private_dataset_id
35
 
36
  def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
37
+ """Searches among discussions of the results dataset for a PR with the given title."""
38
  try:
39
+ discussions = self._api.get_repo_discussions(repo_id=self._results_dataset_id, repo_type="dataset")
40
  except Exception:
41
  return None
42
  for discussion in discussions:
 
44
  return discussion
45
  return None
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  def _upload_request(
48
  self,
49
  task_id: str,
 
58
  pr_url: str,
59
  temp_directory: str,
60
  ) -> List[CommitOperationAdd]:
61
+ """Adds a file with metadata about the current request to the requests dataset."""
62
  request_metadata = {
63
  "model_folder": model_folder,
64
  "model_name_pretty": model_name_pretty,
 
75
  with open(os.path.join(temp_directory, "request_metadata.json"), "w") as f:
76
  json.dump(request_metadata, f)
77
 
78
+ num_requests_already_present = (
79
+ len(self._fs.ls(f"datasets/{self._requests_dataset_id}/{task_id}/"))
80
+ if self._fs.isdir(f"datasets/{self._requests_dataset_id}/{task_id}/")
81
+ else 0
82
+ )
83
  commit_operations = [
84
  CommitOperationAdd(
85
  path_in_repo=f"{task_id}/{num_requests_already_present}_{model_folder}.json",
 
94
  model_folder: str,
95
  filenames: List[str],
96
  ) -> List[CommitOperationAdd]:
97
+ """Adds all files with current model's predictions to the results dataset."""
98
  commit_operations = [
99
  CommitOperationAdd(
100
  path_in_repo=f"{task_id}/predictions/{model_folder}/{os.path.basename(filename)}",
 
105
  return commit_operations
106
 
107
  def _compute_metrics_for_predictions(self, task_id: str, filenames: List[str], temp_directory: str) -> None:
108
+ """Computes metrics for each submitted file with the current model's predictions."""
109
  metrics_module = METRICS[task_id]
110
  assert metrics_module is not None, f"Computing metrics for {task_id} is not supported."
111
  metrics_module.reset()
 
144
  submitted_by: str,
145
  temp_directory: str,
146
  ) -> List[CommitOperationAdd]:
147
+ """Adds files with the current model's metrics values to the results dataset."""
148
  final_results = {}
149
  with open(os.path.join(temp_directory, "final_metrics.json"), "r") as f:
150
  metrics = json.load(f)
151
  final_results.update(metrics)
152
+ final_results.update(
153
+ {
154
+ "model_name": model_name_pretty,
155
+ "model_availability": model_availability,
156
+ "urls": urls,
157
+ "context_size": context_size,
158
+ "submitted_by": submitted_by,
159
+ }
160
  )
 
161
 
162
  with jsonlines.open(os.path.join(temp_directory, "final_results.jsonl"), "w") as writer:
163
  writer.write(final_results)
 
182
  comment: Optional[str],
183
  filenames: Optional[List[str]],
184
  ):
185
+ """Verifies that all necessary arguments are not None (and also runs other sanity checks)."""
186
  assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
187
  assert model_folder, "Please, specify non-empty name for a directory with a model's results."
188
  assert model_name_pretty, "Please, specify non-empty name for a model."
 
232
 
233
  logging.info("Checking if this request has already been submitted...")
234
  if not force:
235
+ if self._fs.isdir(f"datasets/{self._results_dataset_id}/{task_id}/predictions/{model_folder}"):
236
  return styled_warning(
237
+ f"{model_folder} is already present in {self._results_dataset_id}, please, select another folder name."
238
  )
239
 
240
  prev_pr = self._get_previous_pr(pr_title)
241
  if prev_pr is not None:
242
+ url = f"https://huggingface.co/datasets/{self._results_dataset_id}/discussions/{prev_pr.num}"
243
+ return styled_warning(
244
+ f"{self._results_dataset_id} already has an open PR for this submission: {url}."
245
+ )
246
 
247
  logging.info("Processing predictions...")
248
  predictions_commit_operations = self._upload_predictions(
 
267
  temp_directory=str(d),
268
  )
269
 
270
+ logging.info("Creating commit to the results dataset...")
271
  new_pr = self._api.create_commit(
272
+ repo_id=self._results_dataset_id,
273
  operations=predictions_commit_operations + results_commit_operations,
274
  commit_message=pr_title,
275
  commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}""",
 
277
  repo_type="dataset",
278
  )
279
 
280
+ logging.info("Creating commit to the requests dataset...")
281
  request_commit_operations = self._upload_request(
282
  task_id=task_id,
283
  model_folder=model_folder,
 
292
  pr_url=new_pr.pr_url,
293
  )
294
  self._api.create_commit(
295
+ repo_id=self._requests_dataset_id,
296
  operations=request_commit_operations,
297
  commit_message=pr_title,
298
  commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}\n* PR: {new_pr.pr_url}\n* Contact information: {contact_information}\n* Comment: {comment}""",
 
303
  return styled_message(f"πŸŽ‰ PR created at {new_pr.pr_url}.")
304
 
305
  except Exception as e:
 
306
  exception_msg = str(e)
307
  if exception_msg and os.environ["PRIVATE_DATASET_ID"] in exception_msg:
308
  exception_msg = exception_msg.replace(os.environ["PRIVATE_DATASET_ID"], "{private_dataset}")