Be more tolerant of workflow errors, especially when on-error=continue (#263)

tetron · web-flow · commit b4585d207cd6 · 2017-01-05T09:40:51.000-05:00
* Be more tolerant of workflow errors, especially when on-error=continue
* Return partial results (together with a non-success final status) when run with --on-error=continue
* Add tests for partial output after failure.
diff --git a/cwltool/factory.py b/cwltool/factory.py
@@ -3,10 +3,17 @@
 from . import workflow
 import os
 from .process import Process
-from typing import Any, Text, Union
+from typing import Any, Text, Union, Tuple
 from typing import Callable as tCallable
 import argparse
 
+class WorkflowStatus(Exception):
+    def __init__(self, out, status):
+        # type: (Dict[Text,Any], Text) -> None
+        super(WorkflowStatus, self).__init__("Completed %s" % status)
+        self.out = out
+        self.status = status
+
 class Callable(object):
     def __init__(self, t, factory):  # type: (Process, Factory) -> None
         self.t = t
@@ -16,13 +23,17 @@ def __call__(self, **kwargs):
         # type: (**Any) -> Union[Text, Dict[Text, Text]]
         execkwargs = self.factory.execkwargs.copy()
         execkwargs["basedir"] = os.getcwd()
-        return self.factory.executor(self.t, kwargs, **execkwargs)
+        out, status = self.factory.executor(self.t, kwargs, **execkwargs)
+        if status != "success":
+            raise WorkflowStatus(out, status)
+        else:
+            return out
 
 class Factory(object):
     def __init__(self, makeTool=workflow.defaultMakeTool,
                  executor=main.single_job_executor,
                  **execkwargs):
-        # type: (tCallable[[Dict[Text, Any], Any], Process],tCallable[...,Union[Text,Dict[Text,Text]]], **Any) -> None
+        # type: (tCallable[[Dict[Text, Any], Any], Process],tCallable[...,Tuple[Dict[Text,Any], Text]], **Any) -> None
         self.makeTool = makeTool
         self.executor = executor
         self.execkwargs = execkwargs
diff --git a/cwltool/job.py b/cwltool/job.py
@@ -314,7 +314,7 @@ def linkoutdir(src, tgt):
         if processStatus != "success":
             _logger.warn(u"[job %s] completed %s", self.name, processStatus)
         else:
-            _logger.debug(u"[job %s] completed %s", self.name, processStatus)
+            _logger.info(u"[job %s] completed %s", self.name, processStatus)
 
         if _logger.isEnabledFor(logging.DEBUG):
             _logger.debug(u"[job %s] %s", self.name, json.dumps(outputs, indent=4))
diff --git a/cwltool/main.py b/cwltool/main.py
@@ -165,9 +165,9 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
                         help="Will be passed to `docker run` as the '--net' "
                         "parameter. Implies '--enable-net'.")
 
-    parser.add_argument("--on-error", type=Text,
+    parser.add_argument("--on-error", type=str,
                         help="Desired workflow behavior when a step fails.  One of 'stop' or 'continue'. "
-                        "Default is 'stop.", default="stop")
+                        "Default is 'stop'.", default="stop", choices=("stop", "continue"))
 
     exgroup = parser.add_mutually_exclusive_group()
     exgroup.add_argument("--compute-checksum", action="store_true", default=True,
@@ -187,16 +187,12 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
 
 
 def single_job_executor(t, job_order_object, **kwargs):
-    # type: (Process, Dict[Text, Any], **Any) -> Union[Text, Dict[Text, Text]]
+    # type: (Process, Dict[Text, Any], **Any) -> Tuple[Dict[Text, Any], Text]
     final_output = []
     final_status = []
 
     def output_callback(out, processStatus):
         final_status.append(processStatus)
-        if processStatus == "success":
-            _logger.info(u"Final process status is %s", processStatus)
-        else:
-            _logger.warn(u"Final process status is %s", processStatus)
         final_output.append(out)
 
     if "basedir" not in kwargs:
@@ -223,30 +219,30 @@ def output_callback(out, processStatus):
 
     try:
         for r in jobiter:
-            if r.outdir:
-                output_dirs.add(r.outdir)
-
             if r:
+                if r.outdir:
+                    output_dirs.add(r.outdir)
                 r.run(**kwargs)
             else:
-                raise WorkflowException("Workflow cannot make any more progress.")
+                _logger.error("Workflow cannot make any more progress.")
+                break
     except WorkflowException:
         raise
     except Exception as e:
         _logger.exception("Got workflow error")
         raise WorkflowException(Text(e))
 
-    if final_status[0] != "success":
-        raise WorkflowException(u"Process status is %s" % (final_status))
-
-    if final_output[0] and finaloutdir:
+    if final_output and final_output[0] and finaloutdir:
         final_output[0] = relocateOutputs(final_output[0], finaloutdir,
                                           output_dirs, kwargs.get("move_outputs"))
 
     if kwargs.get("rm_tmpdir"):
         cleanIntermediate(output_dirs)
 
-    return final_output[0]
+    if final_output and final_status:
+        return (final_output[0], final_status[0])
+    else:
+        return (None, "permanentFail")
 
 class FSAction(argparse.Action):
     objclass = None  # type: Text
@@ -551,7 +547,7 @@ def versionstring():
 
 def main(argsl=None,  # type: List[str]
          args=None,   # type: argparse.Namespace
-         executor=single_job_executor,  # type: Callable[..., Union[Text, Dict[Text, Text]]]
+         executor=single_job_executor,  # type: Callable[..., Tuple[Dict[Text, Any], Text]]
          makeTool=workflow.defaultMakeTool,  # type: Callable[..., Process]
          selectResources=None,  # type: Callable[[Dict[Text, int]], Dict[Text, int]]
          stdin=sys.stdin,  # type: IO[Any]
@@ -714,7 +710,7 @@ def main(argsl=None,  # type: List[str]
             setattr(args, 'basedir', job_order_object[1])
             del args.workflow
             del args.job_order
-            out = executor(tool, job_order_object[0],
+            (out, status) = executor(tool, job_order_object[0],
                            makeTool=makeTool,
                            select_resources=selectResources,
                            make_fs_access=make_fs_access,
@@ -735,8 +731,14 @@ def locToPath(p):
                     stdout.write(json.dumps(out, indent=4))
                 stdout.write("\n")
                 stdout.flush()
-            else:
+
+            if status != "success":
+                _logger.warn(u"Final process status is %s", status)
                 return 1
+            else:
+                _logger.info(u"Final process status is %s", status)
+                return 0
+
         except (validate.ValidationException) as exc:
             _logger.error(u"Input object failed validation:\n%s", exc,
                     exc_info=args.debug)
diff --git a/cwltool/workflow.py b/cwltool/workflow.py
@@ -141,8 +141,8 @@ def _rec_fields(rec):  # type: (Dict[Text, Any]) -> Dict[Text, Any]
             return False
     return True
 
-def object_from_state(state, parms, frag_only, supportsMultipleInput, sourceField):
-    # type: (Dict[Text, WorkflowStateItem], List[Dict[Text, Any]], bool, bool, Text) -> Dict[Text, Any]
+def object_from_state(state, parms, frag_only, supportsMultipleInput, sourceField, incomplete=False):
+    # type: (Dict[Text, WorkflowStateItem], List[Dict[Text, Any]], bool, bool, Text, bool) -> Dict[Text, Any]
     inputobj = {}  # type: Dict[Text, Any]
     for inp in parms:
         iid = inp["id"]
@@ -172,7 +172,7 @@ def object_from_state(state, parms, frag_only, supportsMultipleInput, sourceFiel
                     raise WorkflowException(
                         u"Connect source '%s' on parameter '%s' does not "
                         "exist" % (src, inp["id"]))
-                else:
+                elif not incomplete:
                     return None
         elif "default" in inp:
             inputobj[iid] = inp["default"]
@@ -225,12 +225,13 @@ def __init__(self, workflow, **kwargs):
 
     def receive_output(self, step, outputparms, jobout, processStatus):
         # type: (WorkflowJobStep, List[Dict[Text,Text]], Dict[Text,Text], Text) -> None
+
         for i in outputparms:
             if "id" in i:
                 if i["id"] in jobout:
                     self.state[i["id"]] = WorkflowStateItem(i, jobout[i["id"]])
                 else:
-                    _logger.error(u"Output is missing expected field %s" % i["id"])
+                    _logger.error(u"[%s] Output is missing expected field %s", step.name, i["id"])
                     processStatus = "permanentFail"
 
         if _logger.isEnabledFor(logging.DEBUG):
@@ -240,9 +241,9 @@ def receive_output(self, step, outputparms, jobout, processStatus):
             if self.processStatus != "permanentFail":
                 self.processStatus = processStatus
 
-            _logger.warn(u"[%s] completion status is %s", step.name, processStatus)
+            _logger.warn(u"[%s] completed %s", step.name, processStatus)
         else:
-            _logger.info(u"[%s] completion status is %s", step.name, processStatus)
+            _logger.info(u"[%s] completed %s", step.name, processStatus)
 
         step.completed = True
 
@@ -363,37 +364,52 @@ def job(self, joborder, output_callback, **kwargs):
                 self.state[out["id"]] = None
 
         completed = 0
-        while completed < len(self.steps) and self.processStatus == "success":
+        while completed < len(self.steps):
             made_progress = False
 
             for step in self.steps:
                 if kwargs.get("on_error", "stop") == "stop" and self.processStatus != "success":
                     break
 
                 if not step.submitted:
-                    step.iterable = self.try_make_job(step, **kwargs)
+                    try:
+                        step.iterable = self.try_make_job(step, **kwargs)
+                    except WorkflowException as e:
+                        _logger.error(u"[%s] Cannot make job: %s", step.name, e)
+                        _logger.debug("", exc_info=True)
+                        self.processStatus = "permanentFail"
 
                 if step.iterable:
-                    for newjob in step.iterable:
-                        if kwargs.get("on_error", "stop") == "stop" and self.processStatus != "success":
-                            break
-                        if newjob:
-                            made_progress = True
-                            yield newjob
-                        else:
-                            break
+                    try:
+                        for newjob in step.iterable:
+                            if kwargs.get("on_error", "stop") == "stop" and self.processStatus != "success":
+                                break
+                            if newjob:
+                                made_progress = True
+                                yield newjob
+                            else:
+                                break
+                    except WorkflowException as e:
+                        _logger.error(u"[%s] Cannot make job: %s", step.name, e)
+                        _logger.debug("", exc_info=True)
+                        self.processStatus = "permanentFail"
 
             completed = sum(1 for s in self.steps if s.completed)
 
             if not made_progress and completed < len(self.steps):
-                yield None
+                if self.processStatus != "success":
+                    break
+                else:
+                    yield None
 
         supportsMultipleInput = bool(self.workflow.get_requirement("MultipleInputFeatureRequirement")[0])
 
-        wo = object_from_state(self.state, self.tool["outputs"], True, supportsMultipleInput, "outputSource")
-
-        if wo is None:
-            raise WorkflowException("Output for workflow not available")
+        try:
+            wo = object_from_state(self.state, self.tool["outputs"], True, supportsMultipleInput, "outputSource", incomplete=True)
+        except WorkflowException as e:
+            _logger.error(u"[%s] Cannot collect workflow output: %s", self.name, e)
+            wo = {}
+            self.processStatus = "permanentFail"
 
         _logger.info(u"[%s] outdir is %s", self.name, self.outdir)
 
@@ -591,17 +607,23 @@ def setTotal(self, total):  # type: (int) -> None
 def parallel_steps(steps, rc, kwargs):  # type: (List[Generator], ReceiveScatterOutput, Dict[str, Any]) -> Generator
     while rc.completed < rc.total:
         made_progress = False
-        for step in steps:
+        for index in xrange(len(steps)):
+            step = steps[index]
             if kwargs.get("on_error", "stop") == "stop" and rc.processStatus != "success":
                 break
-            for j in step:
-                if kwargs.get("on_error", "stop") == "stop" and rc.processStatus != "success":
-                    break
-                if j:
-                    made_progress = True
-                    yield j
-                else:
-                    break
+            try:
+                for j in step:
+                    if kwargs.get("on_error", "stop") == "stop" and rc.processStatus != "success":
+                        break
+                    if j:
+                        made_progress = True
+                        yield j
+                    else:
+                        break
+            except WorkflowException as e:
+                _logger.error(u"Cannot make scatter job: %s", e)
+                _logger.debug("", exc_info=True)
+                rc.receive_scatter_output(index, {}, "permanentFail")
         if not made_progress and rc.completed < rc.total:
             yield None
 
diff --git a/tests/test_examples.py b/tests/test_examples.py
@@ -117,6 +117,29 @@ def test_factory(self):
         echo = f.make("tests/echo.cwl")
         self.assertEqual(echo(inp="foo"), {"out": "foo\n"})
 
+    def test_partial_scatter(self):
+        f = cwltool.factory.Factory(on_error="continue")
+        fail = f.make("tests/wf/scatterfail.cwl")
+        try:
+            fail()
+        except cwltool.factory.WorkflowStatus as e:
+            self.assertEquals('sha1$e5fa44f2b31c1fb553b6021e7360d07d5d91ff5e', e.out["out"][0]["checksum"])
+            self.assertIsNone(e.out["out"][1])
+            self.assertEquals('sha1$a3db5c13ff90a36963278c6a39e4ee3c22e2a436', e.out["out"][2]["checksum"])
+        else:
+            self.fail("Should have raised WorkflowStatus")
+
+    def test_partial_output(self):
+        f = cwltool.factory.Factory(on_error="continue")
+        fail = f.make("tests/wf/wffail.cwl")
+        try:
+            fail()
+        except cwltool.factory.WorkflowStatus as e:
+            self.assertEquals('sha1$e5fa44f2b31c1fb553b6021e7360d07d5d91ff5e', e.out["out1"]["checksum"])
+            self.assertNotIn("out2", e.out)
+        else:
+            self.fail("Should have raised WorkflowStatus")
+
 class TestScanDeps(unittest.TestCase):
     def test_scandeps(self):
         obj = {
diff --git a/tests/wf/cat.cwl b/tests/wf/cat.cwl
@@ -0,0 +1,6 @@
+class: CommandLineTool
+cwlVersion: v1.0
+inputs:
+  r: File
+outputs: []
+arguments: [cat, $(inputs.r.path)]
diff --git a/tests/wf/echo.cwl b/tests/wf/echo.cwl
@@ -0,0 +1,20 @@
+class: CommandLineTool
+cwlVersion: v1.0
+inputs:
+  r: string
+  script:
+    type: string
+    default: |
+      import sys
+      print sys.argv[1]
+      if sys.argv[1] == "2":
+        exit(1)
+      else:
+        f = open("foo"+sys.argv[1]+".txt", "w")
+        f.write(sys.argv[1]+"\n")
+outputs:
+  out:
+    type: File
+    outputBinding:
+      glob: foo$(inputs.r).txt
+arguments: [python, -c, $(inputs.script), $(inputs.r)]
diff --git a/tests/wf/scatterfail.cwl b/tests/wf/scatterfail.cwl
@@ -0,0 +1,25 @@
+class: Workflow
+cwlVersion: v1.0
+requirements:
+  ScatterFeatureRequirement: {}
+inputs:
+  range:
+    type: string[]
+    default: ["1", "2", "3"]
+outputs:
+  out:
+    type: File[]
+    outputSource: step1/out
+steps:
+  step1:
+    in:
+      r: range
+    scatter: r
+    out: [out]
+    run: echo.cwl
+  step2:
+    in:
+      r: step1/out
+    scatter: r
+    out: []
+    run: cat.cwl
diff --git a/tests/wf/wffail.cwl b/tests/wf/wffail.cwl