martinjosifoski committed
Commit
3c7fd6a
1 Parent(s): 716d007

Add Codeforces Flows.

.gitignore ADDED
@@ -0,0 +1,149 @@
+ ### macOS ###
+ # General
+ .DS_Store
+ .AppleDouble
+ .LSOverride
+
+ # Icon must end with two \r
+ Icon
+
+ # Thumbnails
+ ._*
+
+ # Files that might appear in the root of a volume
+ .DocumentRevisions-V100
+ .fseventsd
+ .Spotlight-V100
+ .TemporaryItems
+ .Trashes
+ .VolumeIcon.icns
+ .com.apple.timemachine.donotpresent
+
+ # Directories potentially created on remote AFP share
+ .AppleDB
+ .AppleDesktop
+ Network Trash Folder
+ Temporary Items
+ .apdisk
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit tests / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+ docs/build/
+ docs/docs/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # dotenv
+ .env
+
+ # virtualenv
+ .venv
+ venv/
+ ENV/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+
+ # datafiles
+ .xml
+ .pkl
+
+ # misc
+ .idea/
+ .iml
+ .dropbox
+
+ # media files
+ .png
+ .jpg
+ .pdf
+
+
+ # auto-generated by flows, all synced modules will be ignored by default
+ FLOW_MODULE_ID
CF_Code.py ADDED
@@ -0,0 +1,6 @@
+ from flows.application_flows import OpenAIChatAtomicFlow
+
+
+ class CF_Code(OpenAIChatAtomicFlow):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
CF_Code.yaml ADDED
@@ -0,0 +1,82 @@
+ name: "Code_Flow"
+ description: |2-
+   Given a problem description, generate code directly.
+
+ # ~~~ Input interface specification ~~~
+ input_interface_non_initialized:  # Applied when constructing the first user message.
+   - "problem_description"
+   - "input_description"
+   - "output_description"
+   - "io_examples_and_explanation"
+
+ input_interface_initialized:  # Applied when constructing all subsequent user messages.
+   - "query"
+
+ # ~~~ Output interface specification ~~~
+ output_interface:
+   - "api_output"
+
+ # ~~~ Flow specification ~~~
+ model_name: "gpt-4"
+
+ generation_parameters:
+   n: 1
+   max_tokens: 2000
+   temperature: 0.3
+
+   model_kwargs:
+     top_p: 0.2
+     frequency_penalty: 0
+     presence_penalty: 0
+
+ system_message_prompt_template:
+   _target_: langchain.PromptTemplate
+   template: |2-
+     Your goal is to provide executable Python code that solves a competitive programming problem. The code should correctly handle all corner cases in order to pass the hidden test cases, which are used to evaluate the correctness of the solution.
+
+     The user will specify the problem by providing you with:
+     - the problem statement
+     - input description
+     - output description
+     - example test cases
+     - (optional) explanation of the test cases
+
+     The user will provide you with a task and an output format that you will strictly follow.
+   input_variables: []
+   template_format: jinja2
+
+ human_message_prompt_template:
+   _target_: langchain.PromptTemplate
+   template: "{{query}}"
+   input_variables:
+     - "query"
+   template_format: jinja2
+
+ init_human_message_prompt_template:
+   _target_: langchain.PromptTemplate
+   template: |2-
+     # Problem statement
+     {{problem_description}}
+
+     # Input description
+     {{input_description}}
+
+     # Output description
+     {{output_description}}
+
+     {{io_examples_and_explanation}}
+
+
+     The input should be read from the standard input and the output should be passed to the standard output.
+     Return Python code that solves the problem. Reply in the following format:
+     ```python
+     {{code_placeholder}}
+     ```
+   input_variables:
+     - "problem_description"
+     - "input_description"
+     - "output_description"
+     - "io_examples_and_explanation"
+   partial_variables:
+     code_placeholder: "{{python_code}}"
+   template_format: jinja2
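
The `partial_variables` entry above is worth spelling out: `{{code_placeholder}}` in the template is filled with the literal string `"{{python_code}}"`, so the rendered user message shows the model a named placeholder inside the code fence rather than triggering a second substitution. A minimal sketch of that behaviour (illustrative only, not part of the commit; it uses plain jinja2 rather than langchain.PromptTemplate):

```python
import jinja2

# Illustrative fragment mirroring init_human_message_prompt_template above.
template = (
    "Return Python code that solves the problem. Reply in the following format:\n"
    "```python\n{{code_placeholder}}\n```"
)

rendered = (
    jinja2.Environment(loader=jinja2.BaseLoader())
    .from_string(template)
    .render(code_placeholder="{{python_code}}")  # the partial variable from the config
)
print(rendered)  # the fenced block now contains the literal placeholder {{python_code}}
```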
CF_CodeCollab.py ADDED
@@ -0,0 +1,6 @@
+ from flows.base_flows import GeneratorCriticFlow
+
+
+ class CF_CodeCollab(GeneratorCriticFlow):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
CF_CodeCollab.yaml ADDED
@@ -0,0 +1,93 @@
+ name: "CodeCollab_Flow"
+ description: |2-
+   Given a problem description, alternate between a step in which code is generated, and a step in which the produced code is evaluated and useful feedback is provided.
+
+
+ max_rounds: 4
+
+ input_interface:
+   - "problem_description"
+   - "input_description"
+   - "output_description"
+   - "io_examples_and_explanation"
+ output_interface:
+   - "code"
+
+ subflows_config:
+   CodeGenerator:
+     _target_: .CF_Code.instantiate_from_default_config
+     name: "CodeGenerator"
+     human_message_prompt_template:
+       _target_: langchain.PromptTemplate
+       template: |2-
+         # Feedback on the last proposed solution
+         {{code_feedback}}
+
+
+         Consider the original problem statement, the last proposed solution and the provided feedback. Does the solution need to be updated? If so, provide the corrected version of the code in the following format:
+         ```python
+         {{code_placeholder}}
+         ```
+         otherwise, reply:
+         "Final answer."
+       input_variables:
+         - code_feedback
+       partial_variables:
+         code_placeholder: "{{python_code}}"
+     input_interface_initialized:
+       - "code_feedback"
+   CodeCritic:
+     _target_: .CF_CodeCritic.instantiate_from_default_config
+
+ topology:
+   # ~~~ Code Generator ~~~
+   - goal: "Generate/refine a solution."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+
+     ### Flow Specification
+     flow: CodeGenerator
+
+     ### Output Interface
+     output_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.RegexFirstOccurrenceExtractor
+           regex: '(?<=```python)([\s\S]*?)(?=```)'
+           regex_fallback: '(?<=```)([\s\S]*?)(?=```)'
+           input_key: "api_output"
+           output_key: "code"
+           strip: True
+           assert_unique: True
+         - _target_: flows.data_transformations.EndOfInteraction
+           end_of_interaction_string: "Final answer"
+           input_key: "api_output"
+           output_key: "end_of_interaction"
+         - _target_: flows.data_transformations.PrintPreviousMessages
+     reset: false
+
+   # ~~~ Code Critic ~~~
+   - goal: "Provide feedback for the candidate solution."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+
+     ### Flow Specification
+     flow: CodeCritic
+
+     ### Output Interface
+     output_interface:
+       _target_: flows.interfaces.KeyInterface
+       keys_to_rename:
+         api_output: "code_feedback"
+
+     reset: true
+
+ early_exit_key: "end_of_interaction"
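
The `RegexFirstOccurrenceExtractor` configuration above is the step that turns a free-form model reply into a `code` key. A minimal sketch of the same extraction with the standard library (illustrative only, not part of the commit):

```python
import re

api_output = (
    "Here is the solution:\n"
    "```python\nprint(sum(map(int, input().split())))\n```\n"
    "Final answer."
)

# Primary pattern: first ```python fenced block; fall back to a bare ``` fence.
match = re.search(r"(?<=```python)([\s\S]*?)(?=```)", api_output)
if match is None:
    match = re.search(r"(?<=```)([\s\S]*?)(?=```)", api_output)  # regex_fallback

code = match.group(1).strip() if match else None
print(code)  # -> print(sum(map(int, input().split())))
```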
CF_CodeCritic.py ADDED
@@ -0,0 +1,6 @@
+ from flows.application_flows import OpenAIChatAtomicFlow
+
+
+ class CF_CodeCritic(OpenAIChatAtomicFlow):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
CF_CodeCritic.yaml ADDED
@@ -0,0 +1,87 @@
+ name: "CodeCritic_Flow"
+ description: |2-
+   Given a problem description and a solution candidate, provide useful feedback concerning the correctness of the solution candidate.
+
+ # ~~~ Input interface specification ~~~
+ input_interface_non_initialized:
+   - "problem_description"
+   - "input_description"
+   - "output_description"
+   - "io_examples_and_explanation"
+   - "code"
+
+ input_interface_initialized:
+   - "query"
+
+ # ~~~ Output interface specification ~~~
+ output_interface:
+   - "api_output"
+
+ # ~~~ Flow specification ~~~
+ model_name: "gpt-4"
+
+ generation_parameters:
+   n: 1
+   max_tokens: 3000
+   temperature: 0.3
+
+   model_kwargs:
+     top_p: 0.2
+     frequency_penalty: 0
+     presence_penalty: 0
+
+ system_message_prompt_template:
+   _target_: langchain.PromptTemplate
+   template: |2-
+     Your goal is to identify potential issues with a competitive programming solution attempt.
+
+     The user will specify the problem by providing you with:
+     - the problem statement
+     - input description
+     - output description
+     - example test cases
+     - (optional) explanation of the test cases
+     - a Python solution attempt
+
+     Crucially, your goal is to correctly identify potential issues with the solution attempt, and not to provide the code implementation yourself.
+     The user will provide you with a task and an output format that you will strictly follow.
+   input_variables: []
+   template_format: jinja2
+
+ human_message_prompt_template:
+   _target_: langchain.PromptTemplate
+   template: "{{query}}"
+   input_variables:
+     - "query"
+   template_format: jinja2
+
+ init_human_message_prompt_template:
+   _target_: langchain.PromptTemplate
+   template: |2-
+     # Problem statement
+     {{problem_description}}
+
+     # Input description
+     {{input_description}}
+
+     # Output description
+     {{output_description}}
+
+     {{io_examples_and_explanation}}
+
+     # Python solution attempt:
+     ```python
+     {{code}}
+     ```
+
+
+     Consider the problem statement and the solution attempt. Are there any issues with the proposed solution, or is it correct? Explain your reasoning very concisely, and do not provide code.
+   input_variables:
+     - "problem_description"
+     - "input_description"
+     - "output_description"
+     - "io_examples_and_explanation"
+     - "code"
+   template_format: jinja2
CF_CodeCriticWrongAttempt.py ADDED
@@ -0,0 +1,6 @@
+ from flows.application_flows import OpenAIChatAtomicFlow
+
+
+ class CF_CodeCriticWrongAttempt(OpenAIChatAtomicFlow):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
CF_CodeCriticWrongAttempt.yaml ADDED
@@ -0,0 +1,90 @@
+ name: "CodeCriticWrongAttempt_Flow"
+ description: |2-
+   Given a problem description and an incorrect solution candidate, provide useful feedback for correcting the mistakes in the solution.
+
+ # ~~~ Input interface specification ~~~
+ input_interface_non_initialized:
+   - "problem_description"
+   - "input_description"
+   - "output_description"
+   - "io_examples_and_explanation"
+   - "testing_results_summary"
+   - "code"
+
+ input_interface_initialized:
+   - "query"
+
+ # ~~~ Output interface specification ~~~
+ output_interface:
+   - "api_output"
+
+ # ~~~ Flow specification ~~~
+ model_name: "gpt-4"
+
+ generation_parameters:
+   n: 1
+   max_tokens: 3000
+   temperature: 0.3
+
+   model_kwargs:
+     top_p: 0.2
+     frequency_penalty: 0
+     presence_penalty: 0
+
+ system_message_prompt_template:
+   _target_: langchain.PromptTemplate
+   template: |2-
+     Your goal is to identify the issues with an incorrect competitive programming solution attempt.
+
+     The user will specify the problem by providing you with:
+     - the problem statement
+     - input description
+     - output description
+     - example test cases
+     - (optional) explanation of the test cases
+     - an incorrect Python solution attempt and a description of its issue
+
+     Crucially, your goal is to consider all aspects of the problem and pinpoint the issues with the solution attempt, and not to provide the code implementation yourself.
+     Some aspects to consider: Is the input correctly parsed? Is the output correctly formatted? Are the corner cases correctly handled? Is there a logical mistake with the algorithm itself?
+     Use the code execution results provided in the issue description to guide your reasoning/debugging.
+   input_variables: []
+   template_format: jinja2
+
+ human_message_prompt_template:
+   _target_: langchain.PromptTemplate
+   template: "{{query}}"
+   input_variables:
+     - "query"
+   template_format: jinja2
+
+ init_human_message_prompt_template:
+   _target_: langchain.PromptTemplate
+   template: |2-
+     # Problem statement
+     {{problem_description}}
+
+     # Input description
+     {{input_description}}
+
+     # Output description
+     {{output_description}}
+
+     {{io_examples_and_explanation}}
+
+     # Solution attempt to be fixed
+     ```python
+     {{code}}
+     ```
+
+     {{testing_results_summary}}
+
+
+     Consider the problem statement, the solution attempt and the issue. Why is the solution attempt incorrect? How should it be fixed? Explain your reasoning very concisely, and do not provide code.
+   input_variables:
+     - "problem_description"
+     - "input_description"
+     - "output_description"
+     - "io_examples_and_explanation"
+     - "code"
+     - "testing_results_summary"
+   template_format: jinja2
CF_CodeDebug.py ADDED
@@ -0,0 +1,12 @@
+ from flows.base_flows import GeneratorCriticFlow
+
+
+ class CF_CodeDebug(GeneratorCriticFlow):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def _early_exit(self):
+         if self.flow_state.get("all_tests_passed", False):
+             return True
+
+         return super()._early_exit()
CF_CodeDebug.yaml ADDED
@@ -0,0 +1,158 @@
+ name: "CodeDebug_Flow"
+ description: |2-
+   Given a problem description, generate code, test and refine it until all tests pass or a maximum number of rounds is reached.
+
+ # ~~~ Input interface specification ~~~
+ input_interface:
+   - "problem_description"
+   - "input_description"
+   - "output_description"
+   - "io_examples_and_explanation"
+   - "public_tests_individual_io"
+
+ # ~~~ Output interface specification ~~~
+ output_interface:
+   - "code"
+
+ # ~~~ Flow specification ~~~
+ max_rounds: 4
+
+ ### Subflows specification
+ subflows_config:
+   CodeGenerator:
+     _target_: .CF_Code.instantiate_from_default_config
+     name: "CodeGenerator"
+     model_name: "gpt-4"
+     human_message_prompt_template:
+       template: |2-
+         {{testing_results_summary}}
+
+
+         Consider the problem statement, the last proposed solution, and its issue. Provide a corrected version of the code that solves the original problem and resolves the issue, without any explanation, in the following format:
+         ```python
+         {{code_placeholder}}
+         ```
+       input_variables:
+         - testing_results_summary
+       partial_variables:
+         code_placeholder: "{{python_code}}"
+     input_interface_initialized:
+       - "testing_results_summary"
+   CodeTestingCritic:
+     _target_: .CF_CodeTesting.instantiate_from_default_config
+
+ ### Topology specification (specifies how the sequence of messages will flow from one of the subflows to another)
+ topology:
+   # ~~~ Code Generator ~~~
+   - goal: "Generate/refine a solution."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+
+     ### Flow Specification
+     flow: CodeGenerator
+
+     ### Output Interface
+     output_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.RegexFirstOccurrenceExtractor
+           regex: '(?<=```python)([\s\S]*?)(?=```)'
+           regex_fallback: '(?<=```)([\s\S]*?)(?=```)'
+           input_key: "api_output"
+           output_key: "code"
+           strip: True
+           assert_unique: True
+         - _target_: flows.data_transformations.PrintPreviousMessages
+       keys_to_select:
+         - "code"
+
+     ### Reset flag
+     reset: false
+
+   # ~~~ Code Testing Critic ~~~
+   - goal: "Test the code on the public tests and provide a results summary."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+
+     ### Flow Specification
+     flow: CodeTestingCritic
+
+     ### Output Interface
+     output_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: martinjosifoski.CC_flows.src.data_transformations.CorrectnessFlag
+           input_key: "public_tests_results"
+           output_key: "all_tests_passed"
+         - _target_: martinjosifoski.CC_flows.src.data_transformations.TestingResultsSummaryGeneration
+           output_key: "testing_results_summary"
+
+           single_test_error_message: True
+
+           no_error_template: |2-
+             ${.issue_title}
+             All of the executed tests passed.
+
+           compilation_error_template: |2-
+             ${.issue_title}
+             The execution resulted in a compilation error.
+             ## Compilation error message:
+             {{error_message}}
+           timeout_error_template: |2-
+             ${.issue_title}
+             The execution timed out, the solution is not efficient enough.
+           runtime_error_template: |2-
+             ${.issue_title}
+             The execution resulted in a runtime error on the following test.
+             ## [Failed test] Input
+             ```
+             {{test_input}}
+             ```
+             ## [Failed test] Runtime error message
+             {{error_message}}
+           single_test_error_template: |2-
+             ${.issue_title}
+             The Python code does not solve the problem in the problem description due to logical errors. It fails the following test:
+             ## [Failed test] Input
+             ```
+             {{test_input}}
+             ```
+             ## [Failed test] Expected output
+             ```
+             {{expected_output}}
+             ```
+             ## [Failed test] Generated output
+             ```
+             {{generated_output}}
+             ```
+           all_tests_header: |2-
+             ${.issue_title}
+             The Python code does not solve the problem in the problem description due to logical errors. It fails on the following tests.
+           test_error_template: |2-
+             ## [Failed test {{idx}}]
+             ### [Failed test {{idx}}] Input
+             ```
+             {{test_input}}
+             ```
+             ### [Failed test {{idx}}] Expected output
+             ```
+             {{expected_output}}
+             ```
+             ### [Failed test {{idx}}] Generated output
+             ```
+             {{generated_output}}
+             ```
+           tests_separator: "\n\n"
+
+           issue_title: "# Issue with the last proposed solution"
+
+     ### Reset flag
+     reset: true
CF_CodeDebugCollab.py ADDED
@@ -0,0 +1,12 @@
+ from flows.base_flows import GeneratorCriticFlow
+
+
+ class CF_CodeDebugCollab(GeneratorCriticFlow):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def _early_exit(self):
+         if self.flow_state.get("all_tests_passed", False):
+             return True
+
+         return super()._early_exit()
CF_CodeDebugCollab.yaml ADDED
@@ -0,0 +1,89 @@
+ name: "CodeDebugCollab_Flow"
+ description: |2-
+   Given a problem description, alternate between a step in which code is generated, and a step in which the produced code is evaluated and useful feedback is provided.
+
+ # ~~~ Input interface specification ~~~
+ input_interface:
+   - "problem_description"
+   - "input_description"
+   - "output_description"
+   - "io_examples_and_explanation"
+   - "public_tests_individual_io"
+
+ # ~~~ Output interface specification ~~~
+ output_interface:
+   - "code"
+
+ # ~~~ Flow specification ~~~
+ max_rounds: 4
+
+ subflows_config:
+   CodeGenerator:
+     _target_: .CF_Code.instantiate_from_default_config
+     name: "CodeGenerator"
+     model_name: "gpt-4"
+     human_message_prompt_template:
+       _target_: langchain.PromptTemplate
+       template: |2-
+         {{testing_results_summary}}
+
+         {{code_feedback}}
+
+
+         Consider the problem statement, the last proposed solution, its issue and the provided feedback. Return a corrected version of the code that solves the original problem and resolves the issue, without any explanation, in the following format:
+         ```python
+         {{code_placeholder}}
+         ```
+       input_variables:
+         - code_feedback
+         - testing_results_summary
+       partial_variables:
+         code_placeholder: "{{python_code}}"
+     input_interface_initialized:
+       - "code_feedback"
+       - "testing_results_summary"
+   CodeDebugCritic:
+     _target_: .CF_CodeDebugCritic.instantiate_from_default_config
+
+ topology:
+   # ~~~ Code Generator ~~~
+   - goal: "Generate/refine a solution."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+
+     ### Flow Specification
+     flow: CodeGenerator
+
+     ### Output Interface
+     output_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.RegexFirstOccurrenceExtractor
+           regex: '(?<=```python)([\s\S]*?)(?=```)'
+           regex_fallback: '(?<=```)([\s\S]*?)(?=```)'
+           input_key: "api_output"
+           output_key: "code"
+           strip: True
+           assert_unique: True
+       keys_to_select:
+         - "code"
+
+     reset: false
+
+   # ~~~ Code Critic Grounded in Tests ~~~
+   - goal: "Provide feedback for the candidate solution that is grounded in test results."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+
+     ### Flow Specification
+     flow: CodeDebugCritic
+
+     reset: true
CF_CodeDebugCritic.py ADDED
@@ -0,0 +1,13 @@
+ from flows.base_flows import SequentialFlow
+
+
+ class CF_CodeDebugCritic(SequentialFlow):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def _early_exit(self):
+         if self.flow_state.get("all_tests_passed", False):
+             self.flow_state["code_feedback"] = None
+             return True
+
+         return super()._early_exit()
CF_CodeDebugCritic.yaml ADDED
@@ -0,0 +1,131 @@
+ name: "CodeDebugCritic_Flow"
+ description: "Given a problem description and a candidate solution, test the code and provide useful feedback concerning the correctness of the solution and the potential mistakes."
+
+ # ~~~ Input interface specification ~~~
+ input_interface:
+   - "problem_description"
+   - "input_description"
+   - "output_description"
+   - "io_examples_and_explanation"
+   - "public_tests_individual_io"
+   - "code"
+
+ # ~~~ Output interface specification ~~~
+ output_interface:
+   - "testing_results_summary"
+   - "all_tests_passed"
+   - "code_feedback"
+
+ # ~~~ Flow specification ~~~
+ public_tests_key: "public_tests_individual_io"
+
+ subflows_config:
+   CodeTestingCritic:
+     _target_: .CF_CodeTesting.instantiate_from_default_config
+   CodeCriticWrongAttempt:
+     _target_: .CF_CodeCriticWrongAttempt.instantiate_from_default_config
+
+ topology:
+   # ~~~ Code Testing Critic ~~~
+   - goal: "Test the code on the public tests and provide a results summary."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+
+     ### Flow Specification
+     flow: CodeTestingCritic
+
+     ### Output Interface
+     output_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: martinjosifoski.CC_flows.src.data_transformations.CorrectnessFlag
+           input_key: "public_tests_results"
+           output_key: "all_tests_passed"
+         - _target_: martinjosifoski.CC_flows.src.data_transformations.TestingResultsSummaryGeneration
+           output_key: "testing_results_summary"
+
+           single_test_error_message: True
+
+           no_error_template: |2-
+             ${.issue_title}
+             All of the executed tests passed.
+
+           compilation_error_template: |2-
+             ${.issue_title}
+             The execution resulted in a compilation error.
+             ## Compilation error message:
+             {{error_message}}
+           timeout_error_template: |2-
+             ${.issue_title}
+             The execution timed out, the solution is not efficient enough.
+           runtime_error_template: |2-
+             ${.issue_title}
+             The execution resulted in a runtime error on the following test.
+             ## [Failed test] Input
+             ```
+             {{test_input}}
+             ```
+             ## [Failed test] Runtime error message
+             {{error_message}}
+           single_test_error_template: |2-
+             ${.issue_title}
+             The Python code does not solve the problem in the problem description due to logical errors. It fails the following test:
+             ## [Failed test] Input
+             ```
+             {{test_input}}
+             ```
+             ## [Failed test] Expected output
+             ```
+             {{expected_output}}
+             ```
+             ## [Failed test] Generated output
+             ```
+             {{generated_output}}
+             ```
+           all_tests_header: |2-
+             ${.issue_title}
+             The Python code does not solve the problem in the problem description due to logical errors. It fails on the following tests.
+           test_error_template: |2-
+             ## [Failed test {{idx}}]
+             ### [Failed test {{idx}}] Input
+             ```
+             {{test_input}}
+             ```
+             ### [Failed test {{idx}}] Expected output
+             ```
+             {{expected_output}}
+             ```
+             ### [Failed test {{idx}}] Generated output
+             ```
+             {{generated_output}}
+             ```
+           tests_separator: "\n\n"
+
+           issue_title: "# Issue with the last proposed solution"
+
+   # ~~~ Feedback Generator ~~~
+   - goal: "Generate feedback grounded in the test results summary."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+
+     ### Flow Specification
+     flow: CodeCriticWrongAttempt
+
+     ### Output Interface
+     output_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyRename
+           old_key2new_key:
+             api_output: "code_feedback"
+
+     reset: true
@@ -0,0 +1,6 @@
 
+ from flows.base_flows import GeneratorCriticFlow
+
+
+ class CF_CodeReflect(GeneratorCriticFlow):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
CF_CodeReflect.yaml ADDED
@@ -0,0 +1,85 @@
+ name: "CodeReflect_Flow"
+ description: "Given a problem description, generate code, then reflect on it and improve it until a message suggests that the code is correct or a maximum number of rounds is reached."
+
+ # ~~~ Input interface specification ~~~
+ input_interface:
+   - "problem_description"
+   - "input_description"
+   - "output_description"
+   - "io_examples_and_explanation"
+
+ # ~~~ Output interface specification ~~~
+ output_interface:
+   - "code"
+
+ # ~~~ Flow specification ~~~
+ max_rounds: 4
+
+ ### Subflows specification
+ subflows_config:
+   CodeGenerator:
+     _target_: .CF_Code.instantiate_from_default_config
+   CodeReflectCritic:
+     _target_: .FixedReply_CodeReflect.instantiate_from_default_config
+
+ ### Topology specification (specifies how the sequence of messages will flow from one of the subflows to another)
+ topology:
+   # ~~~ Code Generator ~~~
+   - goal: "Generate/refine a solution."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+       keys_to_rename:
+         code_reflect_message: "query"
+
+     ### Flow Specification
+     flow: CodeGenerator
+
+     ### Output Interface
+     output_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.RegexFirstOccurrenceExtractor
+           regex: '(?<=```python)([\s\S]*?)(?=```)'
+           regex_fallback: '(?<=```)([\s\S]*?)(?=```)'
+           input_key: "api_output"
+           output_key: "code"
+           strip: True
+           assert_unique: True
+         - _target_: flows.data_transformations.EndOfInteraction
+           end_of_interaction_string: "Final answer"
+           input_key: "api_output"
+           output_key: "end_of_interaction"
+         - _target_: flows.data_transformations.PrintPreviousMessages
+       keys_to_select:
+         - "code"
+         - "end_of_interaction"
+
+     ### Reset flag
+     reset: false
+
+   - goal: "Generate a message that encourages reflection."
+
+     ### Input Interface
+     input_interface:
+       _target_: flows.interfaces.KeyInterface
+       additional_transformations:
+         - _target_: flows.data_transformations.KeyMatchInput
+
+     ### Flow Specification
+     flow: CodeReflectCritic
+
+     ### Output Interface
+     output_interface:
+       _target_: flows.interfaces.KeyInterface
+       keys_to_rename:
+         fixed_reply: "code_reflect_message"
+
+     ### Reset flag
+     reset: true
+
+ early_exit_key: "end_of_interaction"
CF_CodeTesting.py ADDED
@@ -0,0 +1,52 @@
+ from typing import Any, Dict
+
+ from flows import logging
+ from flows.utils.general_helpers import validate_parameters
+ from .src.evaluation import testing_utils_codeforces
+ from .CodeTesting import CodeTesting
+
+ log = logging.get_logger(__name__)
+
+
+ class CF_CodeTesting(CodeTesting):
+     REQUIRED_KEYS_CONFIG = []
+     REQUIRED_KEYS_CONSTRUCTOR = []
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     @classmethod
+     def _validate_parameters(cls, kwargs):
+         validate_parameters(cls, kwargs)
+
+         if "public_tests_key" not in kwargs["flow_config"] and "hidden_tests_key" not in kwargs["flow_config"]:
+             raise ValueError("At least one of 'public_tests_key' "
+                              "and 'hidden_tests_key' must be specified in the config.")
+
+     def _get_test_data(self, input_data: Dict):
+         """This function retrieves (or generates) input-output pairs that will be used to test the implementation."""
+         test_data = {"public_tests_io": None, "hidden_tests_io": None}
+
+         if "public_tests_key" in self.flow_config:
+             test_data["public_tests_io"] = input_data[self.flow_config["public_tests_key"]]
+
+         if "hidden_tests_key" in self.flow_config:
+             test_data["hidden_tests_io"] = input_data[self.flow_config["hidden_tests_key"]]
+
+         return test_data
+
+     def _run_tests(self, input_data: Dict, test_data: Dict) -> Dict[str, Any]:
+         testing_results = testing_utils_codeforces.evaluate_solution_for_problem(
+             candidate_solution=input_data["code"],
+             **test_data
+         )
+
+         if "public_tests_results" in testing_results:
+             for test_output in testing_results["public_tests_results"]:
+                 test_output["input"] = "\n".join(test_output["input"])
+
+         if "hidden_tests_results" in testing_results:
+             for test_output in testing_results["hidden_tests_results"]:
+                 test_output["input"] = "\n".join(test_output["input"])
+
+         return testing_results
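
For context, `_run_tests` above delegates to `evaluate_solution_for_problem` from `src/evaluation/testing_utils_codeforces.py`, added later in this commit. A minimal usage sketch of calling that helper directly, assuming the repository's dependencies (e.g. `pyext`, `wrapt_timeout_decorator`) are installed; the import path is illustrative:

```python
from src.evaluation.testing_utils_codeforces import evaluate_solution_for_problem  # illustrative import path

# Toy problem: read two integers from stdin and print their sum.
candidate = "a, b = map(int, input().split())\nprint(a + b)"

results = evaluate_solution_for_problem(
    candidate_solution=candidate,
    public_tests_io=[[["1 2"], "3"], [["10 -4"], "6"]],  # each test: [input lines, expected output]
)

print(results["compilation_status"])                           # True if the snippet compiled
print([r["status"] for r in results["public_tests_results"]])  # per-test pass/fail flags
```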
CF_CodeTesting.yaml ADDED
@@ -0,0 +1,12 @@
+ name: "CF_CodeTesting"
+ description: "Given code and tests specified by input-output pairs, executes the code with the specific input, compares it with the output, and returns an informative message."
+
+ input_interface:
+   - "code"
+   - "public_tests_individual_io"
+
+ output_interface:
+   - "all_tests_passed"
+   - "testing_results_summary"
+
+ public_tests_key: "public_tests_individual_io"
CodeTesting.py ADDED
@@ -0,0 +1,30 @@
+ from typing import Any, Dict
+
+ from flows import logging
+ from flows.base_flows import AtomicFlow
+
+ log = logging.get_logger(__name__)
+
+
+ class CodeTesting(AtomicFlow):
+     REQUIRED_KEYS_CONFIG = []
+     REQUIRED_KEYS_CONSTRUCTOR = []
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def _get_test_data(self, input_data: Dict):
+         """This function retrieves (or generates) input-output pairs that will be used to test the implementation."""
+         raise NotImplementedError()
+
+     def _run_tests(self, input_data: Dict, test_data: Dict) -> Dict[str, Any]:
+         raise NotImplementedError()
+
+     def run(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
+         # ~~~ Retrieve the test data ~~~
+         test_data = self._get_test_data(input_data)
+
+         # ~~~ Run tests ~~~
+         response: Dict[str, Any] = self._run_tests(input_data, test_data)
+
+         return response
FixedReply_CodeReflect.py ADDED
@@ -0,0 +1,6 @@
+ from flows.base_flows import FixedReplyFlow
+
+
+ class FixedReply_CodeReflect(FixedReplyFlow):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
FixedReply_CodeReflect.yaml ADDED
@@ -0,0 +1,18 @@
+ name: "CodeReflectCritic"
+ description: "A flow that prompts the caller to reflect on the generated code and provide a corrected version if necessary."
+
+ input_interface: []
+ output_interface:
+   - fixed_reply
+
+ fixed_reply: |2-
+   Consider the problem statement and the last proposed solution. Are you sure that the solution is provided in the requested format, and crucially, solves the problem?
+   If that is not the case, provide the corrected version of the code in the following format:
+   ```python
+   {{python_code}}
+   ```
+   otherwise, reply:
+   "Final answer."
__init__.py ADDED
@@ -0,0 +1,20 @@
+ # ~~~ Codeforces ~~~
+ # code
+ from .CF_Code import CF_Code
+
+ # code_reflect
+ from .FixedReply_CodeReflect import FixedReply_CodeReflect
+ from .CF_CodeReflect import CF_CodeReflect
+
+ # code_collab
+ from .CF_CodeCritic import CF_CodeCritic
+ from .CF_CodeCollab import CF_CodeCollab
+
+ # code_debug
+ from .CF_CodeTesting import CF_CodeTesting
+ from .CF_CodeDebug import CF_CodeDebug
+
+ # cf-code_debug_collab
+ from .CF_CodeCriticWrongAttempt import CF_CodeCriticWrongAttempt
+ from .CF_CodeDebugCritic import CF_CodeDebugCritic
+ from .CF_CodeDebugCollab import CF_CodeDebugCollab
src/__init__.py ADDED
File without changes
src/data_transformations/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .correctness_flag import CorrectnessFlag
+ from .testing_results_summary_generation import TestingResultsSummaryGeneration
src/data_transformations/correctness_flag.py ADDED
@@ -0,0 +1,14 @@
+ from typing import Dict, Any
+
+ from flows.data_transformations.abstract import DataTransformation
+
+
+ class CorrectnessFlag(DataTransformation):
+     def __init__(self, output_key, input_key):
+         super().__init__(output_key)
+         self.input_key = input_key
+
+     def __call__(self, data_dict: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+         all_tests_passed = all([test_result["status"] for test_result in data_dict[self.input_key]])
+         data_dict[self.output_key] = all_tests_passed
+         return data_dict
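
A minimal usage sketch (illustrative, not part of the commit; the import path and installed `flows` package are assumptions): the transformation collapses a list of per-test results into a single boolean stored under `output_key`.

```python
from src.data_transformations import CorrectnessFlag  # illustrative import path

flag = CorrectnessFlag(output_key="all_tests_passed", input_key="public_tests_results")

data = {
    "public_tests_results": [
        {"status": True, "generated_output": "3"},
        {"status": False, "generated_output": "5"},
    ]
}
data = flag(data)
print(data["all_tests_passed"])  # False, because one test failed
```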
src/data_transformations/testing_results_summary_generation.py ADDED
@@ -0,0 +1,107 @@
+ from typing import Dict, Any
+
+ import jinja2
+
+ from flows.data_transformations.abstract import DataTransformation
+ from flows.utils.general_helpers import unflatten_dict
+
+
+ class TestingResultsSummaryGeneration(DataTransformation):
+     def __init__(self, output_key, **kwargs):
+         super().__init__(output_key)
+         self.params = kwargs
+         if "test_results_key" not in self.params:
+             self.params["test_results_key"] = "public_tests_results"
+         if "tests_passed_key" not in self.params:
+             self.params["tests_passed_key"] = "all_tests_passed"
+
+     def __call__(self, data_dict: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+         if data_dict[self.params["tests_passed_key"]]:
+             # the execution did not result in any errors
+             data_dict[self.output_key] = self.params["no_error_template"]
+             return data_dict
+
+         test_data = unflatten_dict(data_dict)
+
+         if not test_data["compilation_status"]:
+             # compilation error occurred
+             kwargs = {
+                 "error_message": test_data["compilation_error_message"].strip(),
+             }
+
+             message_content = (
+                 jinja2.Environment(loader=jinja2.BaseLoader())
+                 .from_string(self.params["compilation_error_template"])
+                 .render(**kwargs)
+             )
+         elif test_data["timeout_error"]:
+             # timeout error occurred
+
+             message_content = self.params["timeout_error_template"]
+         else:
+             # code compiled successfully without timeouts
+
+             # retrieve the failed tests
+             failed_tests = [
+                 test_result
+                 for test_result in test_data[self.params["test_results_key"]]
+                 if not test_result["status"]
+             ]
+
+             runtime_error_test = None
+             for test_result in failed_tests:
+                 if test_result["generated_output"] is None:
+                     # runtime error occurred
+                     runtime_error_test = test_result
+
+             if runtime_error_test:
+                 # construct the error message for the runtime error
+                 kwargs = {
+                     "test_input": runtime_error_test["input"],
+                     "error_message": runtime_error_test["error_message"].strip(),
+                 }
+
+                 message_content = (
+                     jinja2.Environment(loader=jinja2.BaseLoader())
+                     .from_string(self.params["runtime_error_template"])
+                     .render(**kwargs)
+                 )
+             else:
+                 # construct the error message corresponding to a logical error
+
+                 if self.params["single_test_error_message"]:
+                     # construct the error message for a single (the first) failed test
+                     first_failed_test = failed_tests[0]
+
+                     kwargs = {
+                         "test_input": first_failed_test["input"],
+                         "expected_output": first_failed_test["expected_output"],
+                         "generated_output": first_failed_test["generated_output"],
+                     }
+
+                     message_content = (
+                         jinja2.Environment(loader=jinja2.BaseLoader())
+                         .from_string(self.params["single_test_error_template"])
+                         .render(**kwargs)
+                     )
+                 else:
+                     # construct the error message covering all failed tests
+                     parts = [self.params["all_tests_header"]]
+
+                     for idx, test_result in enumerate(failed_tests):
+                         kwargs = {
+                             "idx": idx + 1,
+                             "test_input": test_result["input"],
+                             "expected_output": test_result["expected_output"],
+                             "generated_output": test_result["generated_output"],
+                         }
+
+                         parts.append(
+                             jinja2.Environment(loader=jinja2.BaseLoader())
+                             .from_string(self.params["test_error_template"])
+                             .render(**kwargs)
+                         )
+
+                     message_content = self.params["tests_separator"].join(parts)
+         data_dict[self.output_key] = message_content
+         return data_dict
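
The class above renders the `*_template` strings from the flow configs with plain jinja2. A minimal sketch of that rendering step for a single failed test (illustrative values only, abridged template):

```python
import jinja2

# Abridged version of the single_test_error_template defined in the flow configs.
template = (
    "# Issue with the last proposed solution\n"
    "## [Failed test] Input\n{{test_input}}\n"
    "## [Failed test] Expected output\n{{expected_output}}\n"
    "## [Failed test] Generated output\n{{generated_output}}"
)

summary = (
    jinja2.Environment(loader=jinja2.BaseLoader())
    .from_string(template)
    .render(test_input="1 2", expected_output="3", generated_output="2")
)
print(summary)  # the issue summary that is fed back to the code generator
```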
src/datasets/__init__.py ADDED
File without changes
src/datasets/schema.py ADDED
@@ -0,0 +1,74 @@
+ from typing import List, Tuple, Dict
+
+
+ def assert_test_format_codeforces(tests: List[Tuple[List[str], str]]):
+     assert isinstance(tests, list) or tests is None
+     if tests is None:
+         return
+     for test in tests:
+         assert isinstance(test, list)
+         assert len(test) == 2
+         inputs, outputs = test
+         assert isinstance(inputs, list)
+         assert isinstance(outputs, str)
+         for input in inputs:
+             assert isinstance(input, str)
+
+
+ def assert_entry_format_codeforces(obj: Dict):
+     # each data point must follow the same schema
+     assert isinstance(obj["id"], str)  # contest + problem_name = id, will not change when formatting changes
+     assert isinstance(obj["id_hash"], str)  # hashsum of all entries, any change to obj will change this
+     assert isinstance(obj["contest"], int)
+     assert isinstance(obj["problem_name"], str)
+     assert isinstance(obj["problem_url"], str)
+     assert isinstance(obj["solution_url"], str)
+
+     assert isinstance(obj["header"], str)
+     assert isinstance(obj["problem_description"], str)
+     assert isinstance(obj["input_description"], str)
+     assert isinstance(obj["output_description"], str)
+     assert isinstance(obj["note"], str) or obj["note"] is None
+
+     assert isinstance(obj["difficulty"], int)
+     assert isinstance(obj["tags"], list)
+     assert isinstance(obj["working_solution"], str)  # can be empty
+
+     assert_test_format_codeforces(obj["public_tests_io"])
+     assert_test_format_codeforces(obj["public_tests_individual_io"])
+     assert_test_format_codeforces(obj["hidden_tests_io"])
+
+
+ def assert_test_format_leetcode(tests: List[Tuple[List[str], str]]):
+     pass
+     # ToDo: Uncomment after the test format is updated
+     # assert isinstance(tests, list)
+     # for test in tests:
+     #     assert isinstance(test, tuple)
+     #     assert len(test) == 2
+     #     x, y = test
+     #     assert isinstance(x, str)
+     #     assert isinstance(y, str)
+
+
+ def assert_entry_format_leetcode(obj: Dict):
+     # each data point must follow the same schema
+     assert isinstance(obj["id"], str)  # contest + problem_name = id, will not change when formatting changes
+     assert isinstance(obj["id_hash"], str)  # hashsum of all entries, any change to obj will change this
+     assert isinstance(obj["index"], int)
+     assert isinstance(obj["problem_name"], str)
+     assert isinstance(obj["problem_url"], str)
+
+     assert isinstance(obj["problem_description"], str)
+     assert isinstance(obj["constraints"], str)
+     assert isinstance(obj["python_stub"], str)
+     assert isinstance(obj["difficulty"], str) and obj["difficulty"] in {"easy", "medium", "hard"}
+
+     # ToDo: Should be added
+     # assert isinstance(obj['tags'], list)
+     # assert isinstance(obj['solution_url'], str)
+     # assert isinstance(obj['working_solution'], str)  # can be empty
+
+     # ToDo: Uncomment after the test format is updated
+     # assert_test_format_leetcode(obj['public_tests_io'])
+     # assert_test_format_leetcode(obj['hidden_tests_io'])
File without changes
src/evaluation/testing_utils_codeforces.py ADDED
@@ -0,0 +1,449 @@
 
1
+ # This is based heavily on the huggingface APPS metric
2
+ import re
3
+
4
+ # to run the solution files we're using a timing based approach
5
+ import signal
6
+ import sys
7
+
8
+ # for capturing the stdout
9
+ from io import StringIO
10
+ from typing import List, Tuple
11
+
12
+ # used for testing the code that reads from input
13
+ from unittest.mock import patch, mock_open
14
+
15
+ import numpy as np
16
+ from pyext import RuntimeModule
17
+ from wrapt_timeout_decorator import timeout as wrapt_timeout
18
+ import threading
19
+
20
+ from ..datasets.schema import assert_test_format_codeforces
21
+
22
+ from flows import logging
23
+
24
+ log = logging.get_logger(__name__)
25
+ lock = threading.Lock()
26
+
27
+
28
+ def evaluate_solution_for_problem(
29
+ candidate_solution,
30
+ hidden_tests_io=None,
31
+ public_tests_io=None,
32
+ timeout=10,
33
+ debug=False,
34
+ add_extra_imports=False,
35
+ allow_truncated_io=False,
36
+ ):
37
+ """See the readme for the output format of this function."""
38
+ if hidden_tests_io is None:
39
+ hidden_tests_io = []
40
+ if public_tests_io is None:
41
+ public_tests_io = []
42
+
43
+ if candidate_solution is None:
44
+ results_dict = {
45
+ "compilation_status": False,
46
+ "compilation_error_message": "No code was provided.",
47
+ "timeout_error": False,
48
+ "hidden_tests_results": [
49
+ {
50
+ "status": False,
51
+ "error_message": "No code was provided.",
52
+ "generated_output": None,
53
+ "input": test[0],
54
+ "expected_output": test[1],
55
+ }
56
+ for test in hidden_tests_io
57
+ ],
58
+ "public_tests_results": [
59
+ {
60
+ "status": False,
61
+ "error_message": "No code was provided.",
62
+ "generated_output": None,
63
+ "input": test[0],
64
+ "expected_output": test[1],
65
+ }
66
+ for test in public_tests_io
67
+ ],
68
+ }
69
+ return results_dict
70
+
71
+ @wrapt_timeout(timeout, use_signals=False)
72
+ def run_tests():
73
+ hidden_tests_results = check_correctness(
74
+ candidate_solution, hidden_tests_io, timeout, debug, add_extra_imports, allow_truncated_io
75
+ )
76
+ public_tests_results = check_correctness(
77
+ candidate_solution, public_tests_io, timeout, debug, add_extra_imports, allow_truncated_io
78
+ )
79
+
80
+ return hidden_tests_results, public_tests_results
81
+
82
+ try:
83
+ lock.acquire()
84
+ hidden_tests_results, public_tests_results = run_tests()
85
+ timeout_error_occurred = False
86
+ lock.release()
87
+ except BaseException as e:
88
+ lock.release()
89
+ log.info(e)
90
+ hidden_tests_results = {}
91
+ public_tests_results = {}
92
+
93
+ hidden_tests_results["compilation_status"] = True
94
+ public_tests_results["compilation_status"] = True
95
+ timeout_error_occurred = True
96
+ hidden_tests_results["error_message"] = "Timeout error."
97
+
98
+ hidden_tests_results["results"] = [
99
+ {
100
+ "status": False,
101
+ "error_message": hidden_tests_results["error_message"],
102
+ "generated_output": None,
103
+ "input": test[0],
104
+ "expected_output": test[1],
105
+ }
106
+ for test in hidden_tests_io
107
+ ]
108
+ public_tests_results["results"] = [
109
+ {
110
+ "status": False,
111
+ "error_message": hidden_tests_results["error_message"],
112
+ "generated_output": None,
113
+ "input": test[0],
114
+ "expected_output": test[1],
115
+ }
116
+ for test in public_tests_io
117
+ ]
118
+
119
+ # the compilation status shouldn't depend on the tests
120
+ assert hidden_tests_results["compilation_status"] == public_tests_results["compilation_status"]
121
+
122
+ results_dict = {
123
+ "compilation_status": hidden_tests_results["compilation_status"],
124
+ "compilation_error_message": hidden_tests_results["error_message"],
125
+ "timeout_error": timeout_error_occurred,
126
+ "hidden_tests_results": hidden_tests_results["results"],
127
+ "public_tests_results": public_tests_results["results"],
128
+ }
129
+
130
+ return results_dict
131
+
132
+
133
+ def check_correctness(
134
+ candidate_solution: str,
135
+ tests: List[Tuple[List[str], str]],
136
+ timeout: int = 6000,
137
+ debug=True,
138
+ add_extra_imports=False,
139
+ allow_truncated_io=True,
140
+ ):
141
+ """
142
+ wrapping the testing code in a global timeout, based on huggingface code
143
+ """
144
+
145
+ assert_test_format_codeforces(tests)
146
+ inputs, outputs = [], []
147
+ if len(tests) > 0:
148
+ inputs, outputs = zip(*tests)
149
+
150
+ compilation_error, results = run_test(
151
+ candidate_solution, inputs, outputs, timeout, debug, add_extra_imports, allow_truncated_io
152
+ )
153
+
154
+ assert len(results) == len(inputs)
155
+
156
+ for result in results:
157
+ assert isinstance(result["generated_output"], str) or result["generated_output"] is None
158
+ assert isinstance(result["status"], bool)
159
+ assert isinstance(result["error_message"], str) or result["error_message"] is None
160
+ assert isinstance(result["input"], list)
161
+ assert isinstance(result["expected_output"], str)
162
+
163
+ compilation_status = compilation_error == ""
164
+ if compilation_status:
165
+ compilation_error = None
166
+
167
+ return {"compilation_status": compilation_status, "error_message": compilation_error, "results": results}
168
+
169
+
170
+ class TimeoutException(Exception):
171
+ pass
172
+
173
+
174
+ def timeout_handler(signum, frame):
175
+ log.info("Alarm went off")
176
+ # return
177
+ raise TimeoutException
178
+
179
+
180
+ signal.signal(signal.SIGALRM, timeout_handler)
181
+
182
+
183
+ # used to capture stdout as a list
184
+ # from https://stackoverflow.com/a/16571630/6416660
185
+ # alternative use redirect_stdout() from contextlib
186
+ class Capturing(list):
187
+ def __enter__(self):
188
+ self._stdout = sys.stdout
189
+ sys.stdout = self._stringio = StringIO()
190
+ # Make closing the StringIO a no-op
191
+ self._stringio.close = lambda x: 1
192
+ return self
193
+
194
+ def __exit__(self, *args):
195
+ self.extend(self._stringio.getvalue().splitlines())
196
+ del self._stringio # free up some memory
197
+ sys.stdout = self._stdout
198
+
199
+
200
+ def run_test(code, inputs, outputs, timeout: int = 10, debug=True, add_extra_imports=False, allow_truncated_io=True):
201
+ """
202
+ runs the code and tries to match inputs and outputs
203
+ the scraped testcases may be incomplete
204
+ if allow_truncated_io==True, then we ignore an EOF exception at the end of the generated output
205
+ """
206
+ # Disable functionalities that can make destructive changes to the test.
207
+
208
+ results = []
209
+
210
+ if isinstance(code, list):
211
+ tmp_test = code
212
+ elif isinstance(code, str):
213
+ tmp_test = code.split("\n")
214
+ else:
215
+ raise AssertionError("code must be provided as list of lines or string with \\n linebreaks.")
216
+
217
+ # parse the code into code and imports
218
+ import_lines = []
219
+ future_import_lines = []
220
+ code_lines = []
221
+ for x in tmp_test:
222
+ if (not x.startswith("from ")) and (not x.startswith("import ")):
223
+ code_lines.append("\t" + x + "\n")
224
+ else:
225
+ if "__future__" in x:
226
+ future_import_lines.append(x + "\n")
227
+ else:
228
+ import_lines.append(x + "\n")
229
+
230
+ # assemble a new solution snippet which wraps the generated solution in a function code()
231
+ new_test = "stdin = sys.stdin\nstdout = sys.stdout\n"
232
+ new_test += '__name__="__main__"\n'
233
+ new_test += "def code():\n"
234
+ new_test += "\tstdin = sys.stdin\n\tstdout = sys.stdout\n"
235
+
236
+ for line in code_lines:
237
+ new_test += line
238
+
239
+ sol = "\n".join(future_import_lines)
240
+ sol += "import sys\n"
241
+ if add_extra_imports:
242
+ sol += "import time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
243
+ sol += "\n".join(import_lines) + "\n" + new_test
244
+
245
+ if debug:
246
+ log.info(f"sol = {sol}")
247
+ method_name = "code"
248
+ signal.alarm(timeout)
249
+
250
+ # convert the solution snippet into a pyext runtime module
251
+ sol_module = None
252
+ try:
253
+ sol_module = RuntimeModule.from_string("tmp_sol", "", sol)
254
+ signal.alarm(0)
255
+ except Exception as e:
256
+ signal.alarm(0)
257
+ if debug:
258
+ log.info(f"type 1 compilation error = {e}")
259
+ for inp, out in zip(inputs, outputs):
260
+ # consider all inputs failed
261
+ results.append(
262
+ {
263
+ "status": False,
264
+ "input": inp,
265
+ "expected_output": out,
266
+ "generated_output": None,
267
+ "error_message": repr(e),
268
+ }
269
+ )
270
+ return repr(e), results
271
+
272
+ assert sol_module is not None
273
+ signal.alarm(0)
274
+
275
+ try:
276
+ method = getattr(sol_module, method_name) # get_attr second arg must be str
277
+ except:
278
+ signal.alarm(0)
279
+ e = sys.exc_info()
280
+ log.info(f"unable to get function error = {e}")
281
+
282
+ for inp, out in zip(inputs, outputs):
283
+ # consider all inputs failed
284
+ results.append(
285
+ {
286
+ "status": False,
287
+ "input": inp,
288
+ "expected_output": out,
289
+ "generated_output": None,
290
+ "error_message": repr(e),
291
+ }
292
+ )
293
+ return repr(e), results
294
+
295
+ # go through all tests, call our runtime module with the inputs
296
+ # then compare with the reference output
297
+ for index, (test_input, reference_output) in enumerate(zip(inputs, outputs)):
298
+
299
+ result_object = {
300
+ "input": test_input,
301
+ "expected_output": reference_output,
302
+ }
303
+
304
+ # if the last token of the input is truncated and marked with "..." we delete it
305
+ input_truncated = False
306
+ if "".join(test_input).strip().endswith("...") and allow_truncated_io:
307
+ test_input = test_input[:-1]
308
+ input_truncated = True
309
+
310
+ # sometimes the last input token is ""
311
+ # if len(test_input)>0:
312
+ # if test_input[-1]=="":
313
+ # test_input = test_input[:-1]
314
+
315
+ error_code = None
316
+ with Capturing() as generated_output:
317
+ try:
318
+ call_method(method, test_input)
319
+ # reset the alarm
320
+ signal.alarm(0)
321
+ except Exception as e:
322
+ # runtime error or took too long
323
+ signal.alarm(0)
324
+ error_code = e
325
+ if debug:
326
+ log.info(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}")
327
+ signal.alarm(0)
328
+
329
+ # in some cases we run into truncated tests
330
+ # in such cases we expect the error code to be None, EOFError or ValueError
331
+ if (
332
+ (input_truncated or reference_output.strip().endswith("..."))
333
+ and allow_truncated_io
334
+ and (error_code is None or isinstance(error_code, EOFError) or isinstance(error_code, ValueError))
335
+ ):
336
+
337
+ generated_output = generated_output[:-1]
338
+ reference_output = reference_output.rstrip("...")
339
+ if len(generated_output) == 0:
340
+ # no output left, we pass by default
341
+ result_object.update(
342
+ **{
343
+ "status": True,
344
+ "generated_output": "\n".join(generated_output),
345
+ "error_message": None,
346
+ }
347
+ )
348
+ results.append(result_object)
349
+ else:
350
+ result_object.update(
351
+ **{
352
+ "status": string_compare(generated_output, reference_output, True),
353
+ "generated_output": "\n".join(generated_output),
354
+ "error_message": None,
355
+ }
356
+ )
357
+ results.append(result_object)
358
+
359
+ # if the input and output are not truncated, we don't allow any errors
360
+ elif error_code is not None:
361
+ result_object.update(**{"status": False, "generated_output": None, "error_message": repr(error_code)})
362
+ results.append(result_object)
363
+ # finally, if there are no errors, we expect the output to match the reference output
364
+ else:
365
+ # the execution went well, let's compare the outputs
366
+ result_object.update(
367
+ **{
368
+ "status": string_compare(generated_output, reference_output, False),
369
+ "generated_output": "\n".join(generated_output),
370
+ "error_message": None,
371
+ }
372
+ )
373
+ results.append(result_object)
374
+
375
+ return "", results
376
+
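+ # Return convention of the function above: an empty string as the first element means the
+ # candidate compiled and every test was attempted; otherwise the repr of the compilation or
+ # introspection error is returned together with the per-test results collected so far.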
377
+
378
+ def string_compare(candidate, correct, truncate_output=False, floating_point_accuracy=0.01):
379
+ candidate = [o.strip().lower() for o in candidate]
380
+ correct = correct.strip().lower()
381
+
382
+ # normalize whitespace
383
+ candidate = "\n".join(candidate)
384
+ candidate = re.sub(r"\s+", " ", candidate).strip()
385
+ correct = re.sub(r"\s+", " ", correct).strip()
386
+
387
+ # split into individual tokens
388
+ candidate = candidate.split(" ")
389
+ correct = correct.split(" ")
390
+
391
+ # some tests may be truncated; in that case we don't enforce that the candidate and reference outputs have the same number of tokens
392
+ if not truncate_output:
393
+ if not len(candidate) == len(correct):
394
+ return False
395
+
396
+ # if we allow truncated io, the last token of the output may have been corrupted
397
+ if truncate_output:
398
+ correct = correct[:-1]
399
+
400
+ # when zip is used for lists of unequal length it will give as many pairs as there are items in the shorter list
401
+ for left, right in zip(candidate, correct):
402
+ if left == right:
403
+ continue
404
+
405
+ try:
406
+ int_left = int(left)
407
+ int_right = int(right)
408
+ if int_left == int_right:
409
+ continue
410
+ except ValueError:
411
+ pass
412
+
413
+ try:
414
+ float_left = float(left)
415
+ float_right = float(right)
416
+ if np.abs(float_left - float_right) < floating_point_accuracy:
417
+ continue
418
+ except ValueError:
419
+ pass
420
+
421
+ return False
422
+
423
+ return True
424
+
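+ # Illustrative example (not part of the evaluation itself): string_compare(["1.0000", "YES"], "1 yes")
+ # returns True, since tokens are lower-cased and whitespace-normalized, and numeric tokens are
+ # compared as ints/floats within floating_point_accuracy.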
425
+
426
+ def call_method(method, inputs):
427
+ if isinstance(inputs, list):
428
+ inputs = "\n".join(inputs)
429
+
430
+ inputs_line_iterator = iter(inputs.split("\n"))
431
+
432
+ # sys.setrecursionlimit(10000)
433
+
434
+ # @patch('builtins.input', side_effect=inputs.split("\n"))
435
+ @patch("builtins.open", mock_open(read_data=inputs))
436
+ @patch("sys.stdin", StringIO(inputs))
437
+ @patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator))
438
+ @patch("sys.stdin.readlines", lambda *args: inputs.split("\n"))
439
+ @patch("sys.stdin.read", lambda *args: inputs)
440
+ # @patch('sys.stdout.write', print)
441
+ def _inner_call_method(_method):
442
+ try:
443
+ return _method()
444
+ except SystemExit as e:
445
+ pass
446
+ finally:
447
+ pass
448
+
449
+ return _inner_call_method(method)
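+
+ # call_method patches builtins.open and the sys.stdin read functions so that a solution
+ # reading from standard input consumes the provided test input instead of the real stdin.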
src/evaluation/testing_utils_leetcode.py ADDED
@@ -0,0 +1,258 @@
1
+ # This is based heavily on the HuggingFace APPS metric.
2
+ # Each candidate solution is executed in a separate subprocess with a timeout;
3
+ # its stdout is captured so that the generated output and the verdict
4
+ # can be parsed back out and compared against the expected output.
5
+ import logging
6
+ import re
7
+ from subprocess import Popen, PIPE, TimeoutExpired
8
+ from typing import List, Tuple
9
+ import threading
10
+
11
+ log = logging.getLogger(__name__)
12
+ lock = threading.Lock()
13
+
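+ # A module-level lock serialises calls to evaluate_solution_for_problem so that
+ # concurrent callers do not interleave their subprocess handling.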
14
+ def evaluate_solution_for_problem(
15
+ candidate_solution,
16
+ python_stub,
17
+ hidden_tests_io=None,
18
+ public_tests_io=None,
19
+ timeout=10,
20
+ debug=False,
21
+ add_extra_imports=False,
22
+ ):
23
+ """See the readme for the output format of this function."""
24
+ with lock:
25
+ if hidden_tests_io is None:
26
+ hidden_tests_io = []
27
+ if public_tests_io is None:
28
+ public_tests_io = []
29
+
30
+ if candidate_solution is None:
31
+ results_dict = {
32
+ "compilation_status": False,
33
+ "compilation_error_message": "No code was provided.",
34
+ "timeout_error": False,
35
+ "hidden_tests_results": [
36
+ {
37
+ "status": False,
38
+ "error_message": "No code was provided.",
39
+ "generated_output": None,
40
+ "input": test[0],
41
+ "expected_output": test[1],
42
+ }
43
+ for test in hidden_tests_io
44
+ ],
45
+ "public_tests_results": [
46
+ {
47
+ "status": False,
48
+ "error_message": "No code was provided.",
49
+ "generated_output": None,
50
+ "input": test[0],
51
+ "expected_output": test[1],
52
+ }
53
+ for test in public_tests_io
54
+ ],
55
+ }
56
+ return results_dict
57
+
58
+ hidden_tests_results = check_correctness(
59
+ candidate_solution, python_stub, hidden_tests_io, timeout, debug, add_extra_imports
60
+ )
61
+ public_tests_results = check_correctness(
62
+ candidate_solution, python_stub, public_tests_io, timeout, debug, add_extra_imports
63
+ )
64
+
65
+ # the compilation status shouldn't depend on the tests
66
+ if len(hidden_tests_io) > 0 and len(public_tests_io) > 0:
67
+ assert hidden_tests_results["compilation_status"] == public_tests_results["compilation_status"]
68
+
69
+ compilation_status = True
70
+ error_message = None
71
+ timeout_error = False
72
+
73
+ if len(hidden_tests_io) > 0:
74
+ compilation_status = compilation_status and hidden_tests_results["compilation_status"]
75
+ error_message = hidden_tests_results["error_message"]
76
+ timeout_error = timeout_error or hidden_tests_results["timeout_error"]
77
+
78
+ if len(public_tests_io) > 0:
79
+ compilation_status = compilation_status and public_tests_results["compilation_status"]
80
+ error_message = public_tests_results["error_message"]
81
+ timeout_error = timeout_error or public_tests_results["timeout_error"]
82
+
83
+ results_dict = {
84
+ "compilation_status": compilation_status,
85
+ "compilation_error_message": error_message,
86
+ "timeout_error": timeout_error,
87
+ "hidden_tests_results": hidden_tests_results["results"],
88
+ "public_tests_results": public_tests_results["results"],
89
+ }
90
+
91
+ return results_dict
92
+
93
+
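+ # Hedged usage sketch for the function above (the variable names and test values are hypothetical,
+ # and the exact (input, expected_output, explanation) tuple format is defined by the caller):
+ #
+ #   results = evaluate_solution_for_problem(
+ #       candidate_solution=generated_code,  # hypothetical string holding a LeetCode-style class Solution
+ #       python_stub="class Solution:\n    def twoSum(self, nums: List[int], target: int) -> List[int]:",
+ #       public_tests_io=[("nums = [2, 7, 11, 15], target = 9", "[0, 1]", "")],
+ #   )
+ #   results["compilation_status"], results["public_tests_results"]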
94
+ def check_correctness(
95
+ candidate_solution: str,
96
+ python_stub: str,
97
+ tests: List[Tuple[str, str, str]],  # each test is an (input, expected_output, explanation) triple
98
+ timeout: int = 6000,
99
+ debug=True,
100
+ add_extra_imports=False,
101
+ ):
102
+ compilation_status = True
103
+ compilation_error = None
104
+ results = []
105
+ timeout_occurred = False
106
+
107
+ for idx, test in enumerate(tests):
108
+ inp, out, expl = test
109
+ result = one_test(
110
+ candidate_solution, python_stub, inp, out, timeout=timeout, debug=debug, add_extra_imports=add_extra_imports
111
+ )
112
+ error_message = result["error_message"]
113
+
114
+ if error_message is not None:
115
+ if "syntaxerror" in error_message.lower():
116
+ compilation_status = False
117
+ compilation_error = error_message
118
+ if "timeout" in error_message.lower():
119
+ timeout_occurred = True
120
+ results.append(result)
121
+
122
+ if timeout_occurred:
123
+ break
124
+
125
+ if timeout_occurred:
126
+ return {
127
+ "compilation_status": True,
128
+ "timeout_error": True,
129
+ "error_message": "Timeout error.",
130
+ "results": results,
131
+ }
132
+
133
+ return {
134
+ "compilation_status": compilation_status,
135
+ "timeout_error": False,
136
+ "error_message": compilation_error,
137
+ "results": results,
138
+ }
139
+
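+ # Note on the function above: a timeout in any single test aborts the remaining tests of that
+ # suite; in that case compilation is reported as successful and timeout_error is set instead.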
140
+
141
+ def one_test(candidate_solution, python_stub, inp, out, timeout=10, debug=False, add_extra_imports=False):
142
+ python_stub = python_stub.strip()
143
+ candidate_solution = candidate_solution.strip()
144
+
145
+ out = out.replace("null", "None").replace("true", "True").replace("false", "False")
146
+
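+ # JSON-style literals in the expected output (null/true/false) are mapped to their Python
+ # equivalents above so the string can later be interpolated and compared as a Python value.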
147
+ # reformat the solution and parse class and method name
148
+ class_def, signature = python_stub.split(" def ")
149
+ class_name = class_def.split("class ")[1].strip().rstrip(":")
150
+ func_name, _ = signature.split("(")
151
+
152
+ # reformatting the input
153
+ first_param = r"^\w+\s\=\s"
154
+ later_params = r",\s\w+\s\=\s"
155
+
156
+ inp = re.sub(first_param, "", inp)
157
+ inp = re.sub(later_params, ", ", inp)
158
+
159
+ # we add custom code to invoke the solution
160
+ before_output = "AFTER THIS COMES OUR OWN GENERATED OUTPUT !@#!@!"
161
+ after_output = "AFTER THIS COMES OUR VERDICT !@#!@!"
162
+
163
+ if add_extra_imports:
164
+ sol = f"""
165
+ from collections import *
166
+ from math import *
167
+ import math
168
+ from functools import *
169
+ from heapq import *
170
+ import heapq
171
+ import itertools
172
+ from itertools import *
173
+ import bisect
174
+ from bisect import *
175
+ """
176
+ else:
177
+ sol = ""
178
+
179
+ sol += f"""
180
+ from typing import List, Tuple, Optional
181
+ {candidate_solution}
182
+ sfohsdfdsfjhsdkfjhsdkjfh = {class_name}()
183
+ res = sfohsdfdsfjhsdkfjhsdkjfh.{func_name}({inp})
184
+
185
+ def nested_list_convert(inp):
186
+ try:
187
+ try:
188
+ inp = list(inp)
189
+ except BaseException as e:
190
+ return inp
191
+ out = []
192
+ for i in inp:
193
+ out.append(nested_list_convert(i))
194
+ except BaseException as e:
195
+ return inp
196
+ return out
197
+
198
+ matching = False
199
+ matching = matching or res == {out}
200
+ matching = matching or nested_list_convert(res) == {out}
201
+ matching = matching or nested_list_convert(res) == nested_list_convert({out})
202
+ matching = matching or str({out})==str(res).replace("{{","[").replace("(","[").replace("}}","]").replace(")","]")
203
+ matching = matching or str({out})==str(res).replace("{{","[").replace("(","[").replace("}}","]").replace(")","]")
204
+ print("res: ", res)
205
+ print("out: ", {out})
206
+ print("{before_output}")
207
+ print(res)
208
+ print("{after_output}")
209
+ print(matching)
210
+ """
211
+
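+ # The two sentinel strings above delimit, within the captured stdout, the solution's own prints,
+ # the generated result, and the final True/False matching verdict that is parsed further below.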
212
+ cmd = "python3"
213
+
214
+ proc = Popen([cmd, "-c", sol], stdin=PIPE, stdout=PIPE, stderr=PIPE)
215
+
216
+ result_object = {"input": inp, "expected_output": out.strip('"')}
217
+
218
+ try:
219
+ stdout, stderr = proc.communicate("", timeout=timeout)
220
+ except TimeoutExpired as e:
221
+ if debug:
222
+ log.info(f"Timeout error, timeout={timeout}")
223
+ result_object.update({"status": False, "error_message": "Timeout error.", "generated_output": None})
224
+ return result_object
225
+
226
+ finally:
227
+ proc.kill()
228
+
229
+ stdout = stdout.decode()
230
+ stderr = stderr.decode().lower()
231
+
232
+ if stderr == "":
233
+ # No compilation or runtime error
234
+ stderr = None
235
+ else:
236
+ # Runtime or compilation error (distinction is made by the presence of "syntaxerror" in the error message)
237
+ result_object.update(**{"status": False, "error_message": stderr, "generated_output": None})
238
+ return result_object
239
+
240
+ try:
241
+ generated_output = stdout.split(before_output)[1]
242
+ generated_output, verdict = generated_output.split(after_output)
243
+ result_object.update(
244
+ **{
245
+ "status": verdict.strip() == "True",
246
+ "error_message": stderr,
247
+ "generated_output": generated_output.strip(),
248
+ }
249
+ )
250
+ return result_object
251
+ except IndexError as e:
252
+ raise Exception(f"An unexpected error has occurred while parsing the following generated output: {stdout}")
253
+ # Used in debugging
254
+ # log.info(e)
255
+ # result_object.update(
256
+ # **{"status": False, "error_message": "The output couldn't be parsed", "generated_output": None}
257
+ # )
258
+ # return result_object