Spaces:

hage2000
/

code_eval_stdio

Sleeping

App Files Community

Kewen Zhao committed on Nov 22, 2024

Commit

af3f724

1 Parent(s): 47feec4

stdio format

Browse files

Files changed (2) hide show

code_eval_stdio.py +3 -4
execute.py +45 -35

code_eval_stdio.py CHANGED Viewed

@@ -152,7 +152,7 @@ class CodeEval(evaluate.Metric):
             license=_LICENSE,
         )
-    def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
         """Returns the scores"""
         if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
@@ -167,10 +167,9 @@ class CodeEval(evaluate.Metric):
             n_samples = 0
             results = defaultdict(list)
-            for task_id, (candidates, test_case) in enumerate(zip(predictions, references)):
                 for candidate in candidates:
-                    test_program = candidate + "\n" + test_case
-                    args = (test_program, timeout, task_id, completion_id[task_id])
                     future = executor.submit(check_correctness, *args)
                     futures.append(future)
                     completion_id[task_id] += 1

             license=_LICENSE,
         )
+    def _compute(self, program, test_input, test_output, k=[1, 10, 100], num_workers=4, timeout=3.0):
         """Returns the scores"""
         if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
             n_samples = 0
             results = defaultdict(list)
+            for task_id, (candidates, inputs, outputs) in enumerate(zip(program, test_input, test_output)):
                 for candidate in candidates:
+                    args = (candidate, inputs, outputs, timeout, task_id, completion_id[task_id])
                     future = executor.submit(check_correctness, *args)
                     futures.append(future)
                     completion_id[task_id] += 1

execute.py CHANGED Viewed

@@ -25,24 +25,27 @@ import signal
 import tempfile
-def check_correctness(check_program, timeout, task_id, completion_id):
     """
     Evaluates the functional correctness of a completion by running the test
     suite provided in the problem.
-    :param completion_id: an optional completion ID so we can match
-        the results later even if execution finishes asynchronously.
     """
     manager = multiprocessing.Manager()
     result = manager.list()
-    p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
-    p.start()
-    p.join(timeout=timeout + 1)
-    if p.is_alive():
-        p.kill()
-    if not result:
         result.append("timed out")
     return dict(
@@ -53,8 +56,16 @@ def check_correctness(check_program, timeout, task_id, completion_id):
     )
-def unsafe_execute(check_program, result, timeout):
     with create_tempdir():
         # These system calls are needed when cleaning up tempdir.
@@ -71,10 +82,24 @@ def unsafe_execute(check_program, result, timeout):
         # Run program.
         try:
             exec_globals = {}
-            with swallow_io():
                 with time_limit(timeout):
-                    exec(check_program, exec_globals)
-            result.append("passed")
         except TimeoutException:
             result.append("timed out")
         except BaseException as e:
@@ -100,11 +125,13 @@ def time_limit(seconds):
 @contextlib.contextmanager
-def swallow_io():
-    stream = WriteOnlyStringIO()
-    with contextlib.redirect_stdout(stream):
-        with contextlib.redirect_stderr(stream):
-            with redirect_stdin(stream):
                 yield
@@ -119,23 +146,6 @@ class TimeoutException(Exception):
     pass
-class WriteOnlyStringIO(io.StringIO):
-    """StringIO that throws an exception when it's read from"""
-    def read(self, *args, **kwargs):
-        raise OSError
-    def readline(self, *args, **kwargs):
-        raise OSError
-    def readlines(self, *args, **kwargs):
-        raise OSError
-    def readable(self, *args, **kwargs):
-        """Returns True if the IO object can be read."""
-        return False
 class redirect_stdin(contextlib._RedirectStream):  # type: ignore
     _stream = "stdin"

 import tempfile
+def check_correctness(program, test_input, test_output, timeout, task_id, completion_id):
     """
     Evaluates the functional correctness of a completion by running the test
     suite provided in the problem.
+    :param program: The program string to evaluate.
+    :param test_input: The input string to provide to the program via STDIN.
+    :param test_output: The expected output string from the program via STDOUT.
+    :param timeout: Maximum execution time in seconds.
+    :param task_id: ID of the task being evaluated.
+    :param completion_id: Completion ID to match results later.
     """
     manager = multiprocessing.Manager()
     result = manager.list()
+    process = multiprocessing.Process(target=unsafe_execute, args=(program, test_input, test_output, result, timeout))
+    process.start()
+    process.join(timeout=timeout + 1)
+    if process.is_alive():
+        process.kill()
         result.append("timed out")
     return dict(
     )
+def unsafe_execute(program, test_input, test_output, result, timeout):
+    """
+    Executes the program with redirected STDIN and compares its STDOUT to the expected output.
+    :param program: The program string to execute.
+    :param test_input: Input to provide to the program via STDIN.
+    :param test_output: Expected output to compare with STDOUT.
+    :param result: A multiprocessing.Manager().list() to store the execution result.
+    :param timeout: Maximum execution time in seconds.
+    """
     with create_tempdir():
         # These system calls are needed when cleaning up tempdir.
         # Run program.
         try:
             exec_globals = {}
+            actual_output = None
+            # Redirect I/O and execute
+            input_stream = io.StringIO(test_input)
+            output_stream = io.StringIO()
+            with swallow_io(input_stream, output_stream):
                 with time_limit(timeout):
+                    exec(program, exec_globals)
+            actual_output = output_stream.getvalue().strip()
+            expected_output = test_output.strip()
+            if actual_output == expected_output:
+                result.append("passed")
+            else:
+                result.append(f"failed: got '{actual_output}', expected '{expected_output}'")
         except TimeoutException:
             result.append("timed out")
         except BaseException as e:
 @contextlib.contextmanager
+def swallow_io(input_stream, output_stream):
+    """
+    Redirects STDIN, STDOUT, and STDERR for isolated execution.
+    """
+    with contextlib.redirect_stdout(output_stream):
+        with contextlib.redirect_stderr(output_stream):
+            with redirect_stdin(input_stream):
                 yield
     pass
 class redirect_stdin(contextlib._RedirectStream):  # type: ignore
     _stream = "stdin"