# db/d12/trfGraph_8py_source.html

# Copyright (C) 2002-2021 CERN for the benefit of the ATLAS collaboration


import copy

import os


import logging

msg = logging.getLogger(__name__)


import PyJobTransforms.trfExceptions as trfExceptions


from PyJobTransforms.trfExitCodes import trfExit


class executorGraph(object):


    def __init__(self, executorSet, inputData = None, outputData = None):
        """Build the graph nodes for a set of executors and connect them.

        Parameters:
          executorSet -- sized iterable of executor objects; each one is
                         wrapped in a node through addNode().
          inputData   -- iterable of data types fed into the transform
                         (default: no input data).
          outputData  -- iterable of data types the transform should produce
                         (default: no output data).

        Raises:
          trfExceptions.TransformSetupException -- when a data type appears in
          both inputData and outputData.
        """
        # Use fresh sets when nothing was passed. Default arguments are
        # evaluated once, so a set() default would be one shared object which
        # is handed to the executor below and could be mutated through it.
        if inputData is None:
            inputData = set()
        if outputData is None:
            outputData = set()

        # Set basic node list
        self._nodeDict = {}

        msg.info('Transform graph input data: {0}; output data {1}'.format(inputData, outputData))

        if len(executorSet) == 1:
            # Single executor - in this case inData/outData is not mandatory, so we set them to the
            # input/output data of the transform
            executor = list(executorSet)[0]
            if len(executor._inData) == 0 and len(executor._outData) == 0:
                executor.inData = inputData
                executor.outData = outputData

        for executor in executorSet:
            self.addNode(executor)

        self._inputData = set(inputData)
        self._outputData = set(outputData)

        # It's forbidden for a transform to consume and produce the same datatype
        dataOverlap = self._inputData & self._outputData
        if len(dataOverlap) > 0:
            raise trfExceptions.TransformSetupException(trfExit.nameToCode('TRF_GRAPH_ERROR'),
                                                        'Transform definition error, you cannot produce and consume the same datatypes in a transform. Duplicated input/output types {0}.'.format(' '.join(dataOverlap)))

        # Add pseudo start/end nodes, from which input data flows and at which
        # output data finally arrives. This makes the graph 'concrete' for this
        # job, as every data edge then connects a pair of nodes.
        # An end node is added for every data type any node can output
        # (intermediate data included), which enables topo sorting of the graph.
        pseudoNodes = dict()
        pseudoNodes['_start'] = graphNode(name='_start', inData=[], outData=self._inputData, weight = 0)
        for node in self._nodeDict.values():
            for dataType in node.outputDataTypes:
                endNodeName = '_end_{0}'.format(dataType)
                pseudoNodes[endNodeName] = graphNode(name=endNodeName, inData=[dataType], outData=[], weight = 0)
        self._nodeDict.update(pseudoNodes)

        # Toposort not yet done
        self._toposort = []
        self._toposortData = []

        # Now find connections between nodes
        self.findConnections()


    @property

    def inputData(self):

        return self._inputData


    @inputData.setter

    def inputData(self, inputData):

        self._inputData = set(inputData)


    @property

    def outputData(self):

        return self._outputData


    @outputData.setter

    def outputData(self, outputData):

        self._outputData = set(outputData)


    @property
    def execution(self):
        """List the enabled real nodes in topological order.

        Each entry is a dictionary with 'name', 'input' and 'output' keys taken
        from the execution dictionary. Reads self._execution, which is created
        by findExecutionPath().
        """
        # Start and end nodes are not real - they never actually execute
        pseudoPrefixes = ('_start', '_end')
        steps = []
        for name in self._toposort:
            if name.startswith(pseudoPrefixes):
                continue
            state = self._execution[name]
            if state['enabled'] is not True:
                continue
            steps.append({'name': name, 'input': state['input'], 'output': state['output']})
        return steps


    @property
    def data(self):
        """Set of all data types read or written by the enabled real nodes.

        Reads self._execution, which is created by findExecutionPath().
        """
        # Start and end nodes are not real - they never actually execute
        pseudoPrefixes = ('_start', '_end')
        touched = set()
        for name in self._toposort:
            if name.startswith(pseudoPrefixes):
                continue
            state = self._execution[name]
            if state['enabled'] is True:
                touched.update(state['input'], state['output'])
        return touched


    def addNode(self, executor):
        """Wrap the executor in an executorNode and register it under its name."""
        wrapped = executorNode(executor)
        self._nodeDict[executor.name] = wrapped


    def deleteNote(self, executor):
        """Remove the node registered under the executor's name, if present.

        Does nothing when no node with that name exists.
        NOTE(review): the method name looks like a typo for 'deleteNode'; it is
        kept unchanged because callers may rely on it.
        """
        self._nodeDict.pop(executor.name, None)


    def _resetConnections(self):
        """Ask every node in the graph to drop its recorded connections."""
        for nodeName in self._nodeDict:
            self._nodeDict[nodeName].resetConnections()


    def findConnections(self):
        """Recompute every data edge between pairs of distinct nodes.

        All existing connections are cleared first. For each ordered pair
        (producer, consumer) whose output and input data types intersect, an
        'out' edge is recorded on the producer and an 'in' edge on the
        consumer, both carrying the shared data types.
        """
        self._resetConnections()

        for producerName, producer in self._nodeDict.items():
            for consumerName, consumer in self._nodeDict.items():
                if producerName != consumerName:
                    produced = set(producer.outputDataTypes)
                    consumed = set(consumer.inputDataTypes)
                    shared = list(produced & consumed)
                    msg.debug('Data connections between {0} and {1}: {2}'.format(producerName, consumerName, shared))
                    if shared:
                        producer.addConnection(consumerName, shared, direction='out')
                        consumer.addConnection(producerName, shared, direction='in')

        msg.debug('Graph connections are: \n{0}'.format(self))


    def doToposort(self):
        """Topologically sort the nodes, then derive an ordering of data types.

        On success self._toposort holds the node names in dependency order and
        self._toposortData holds each data type once, in the order it is first
        seen while walking those nodes (inputs before outputs).

        Both results are built in local lists and stored only after the sort
        succeeds, so calling this method again does not append duplicate
        entries to a previous result, and a failed sort leaves the previous
        result untouched.

        Raises:
          trfExceptions.TransformGraphException -- when no node is free of
          incoming edges, or when nodes remain after all reachable ones have
          been emitted (the graph is not a DAG).
        """
        # We will manipulate the graph, so deepcopy it
        graphCopy = copy.deepcopy(self._nodeDict)

        # Find all valid start nodes in this graph - ones with no data dependencies themselves
        startNodeNames = [nodeName for nodeName, node in graphCopy.items()
                          if len(node.connections['in']) == 0]

        if len(startNodeNames) == 0:
            raise trfExceptions.TransformGraphException(trfExit.nameToCode('TRF_GRAPH_ERROR'),
                                                        'There are no starting nodes in this graph - non-DAG graphs are not supported')

        msg.debug('Found this list of start nodes for toposort: {0}'.format(startNodeNames))

        # The startNodeNames holds the list of nodes with their dependencies now satisfied (no input edges anymore)
        nodeOrder = []
        while len(startNodeNames) > 0:
            # Take the next startNodeName and zap it from the graph
            theNodeName = startNodeNames.pop()
            theNode = graphCopy.pop(theNodeName)
            nodeOrder.append(theNodeName)

            # Now delete the edges this node was a source for
            msg.debug('Considering connections from node {0}'.format(theNodeName))
            for connectedNodeName in theNode.connections['out']:
                graphCopy[connectedNodeName].delConnection(toExe = theNodeName, direction = 'in')
                # Look for nodes which now have their dependencies satisfied
                if len(graphCopy[connectedNodeName].connections['in']) == 0:
                    startNodeNames.append(connectedNodeName)

        # If there are nodes left then the graph has cycles, which means it's not a DAG
        if len(graphCopy) > 0:
            raise trfExceptions.TransformGraphException(trfExit.nameToCode('TRF_GRAPH_ERROR'),
                                                        'Graph topological sort had no more start nodes, but nodes were left {0} - non-DAG graphs are not supported'.format(list(graphCopy)))

        self._toposort = nodeOrder
        msg.debug('Topologically sorted node order: {0}'.format(self._toposort))

        # Now toposort the input data for nodes
        dataOrder = []
        for nodeName in nodeOrder:
            node = self._nodeDict[nodeName]
            # First add input data, then output data
            for dataType in node.inputDataTypes:
                if dataType not in dataOrder:
                    dataOrder.append(dataType)
            for dataType in node.outputDataTypes:
                if dataType not in dataOrder:
                    dataOrder.append(dataType)
        self._toposortData = dataOrder

        msg.debug('Topologically sorted data order: {0}'.format(self._toposortData))


    def findExecutionPath(self):
        """Work out which nodes must run, and with which data, to make the outputs.

        Rebuilds self._execution, a dictionary keyed by node name whose values
        are {'enabled': bool, 'input': set, 'output': set}. Starting from the
        requested output data, the next data type (in the order held in
        self._toposortData) is taken, the best path producing it is obtained
        from _bestPath(), and every node on that path is enabled. Data the
        path needs which is not yet available is queued for production.
        Finally the placeholder types 'inNULL' and 'outNULL' are stripped from
        every node's input and output sets.

        Raises:
          trfExceptions.TransformGraphException -- when data still has to be
          produced but none of it appears in the toposorted data list.
        """
        # Switch off all nodes, except if we have a single node which is not data driven...
        # (the original tested inputDataTypes twice; the second test is
        # presumably meant to cover the outputs)
        singleNode = len(self._nodeDict) == 1
        self._execution = {}
        for nodeName, node in self._nodeDict.items():
            notDataDriven = node.inputDataTypes == set() and node.outputDataTypes == set()
            self._execution[nodeName] = {'enabled' : singleNode and notDataDriven,
                                         'input' : set(), 'output' : set()}

        dataToProduce = copy.deepcopy(self._outputData)
        dataAvailable = copy.deepcopy(self._inputData)

        # Consider the next data type in topo order
        while len(dataToProduce) > 0:
            nextDataType = None
            for dataType in self._toposortData:
                if dataType in dataToProduce:
                    nextDataType = dataType
                    dataToProduce.remove(nextDataType)
                    dataAvailable.update([nextDataType])
                    break

            if nextDataType is None:
                msg.error('Still have to produce data type(s) {0}, but did not find anything in the toposorted data list ({1}).'
                          ' Transform parameters/graph are broken so aborting.'.format(dataToProduce, self._toposortData))
                raise trfExceptions.TransformGraphException(trfExit.nameToCode('TRF_GRAPH_ERROR'),
                                                            'Data type graph error')

            msg.debug('Next data type to try is {0}'.format(nextDataType))
            bestPath = self._bestPath(nextDataType, dataAvailable)

            msg.debug('Found best path for {0}: {1}'.format(nextDataType, bestPath))

            # Walk the path as (node, successor) pairs; the last node has no successor
            for nodeName, nextNodeName in zip(bestPath.path, bestPath.path[1:] + [None]):
                self._execution[nodeName]['enabled'] = True
                # Add the necessary data types to the output of the first node and the input of the next
                if nodeName in bestPath.newData:
                    self._execution[nodeName]['output'].update(bestPath.newData[nodeName])
                    for newData in bestPath.newData[nodeName]:
                        if newData not in dataAvailable:
                            dataToProduce.update([newData])
                if nextNodeName:
                    self._execution[nextNodeName]['input'].update(bestPath.newData[nodeName])
                    # NOTE(review): the membership test uses nextNodeName but the
                    # lookup uses nodeName - kept as in the original, confirm intent
                    if nextNodeName in bestPath.extraData:
                        self._execution[nextNodeName]['input'].update(bestPath.extraData[nodeName])

            # Add any extra data we need (from multi-exit nodes) to the data to produce list.
            # This does not depend on the node being visited, so it is done once per path.
            for extraNodeData in bestPath.extraData.values():
                for extra in extraNodeData:
                    if extra not in dataAvailable:
                        dataToProduce.update([extra])

        # Now remove the fake data objects from activated nodes
        fakeData = set(['inNULL', 'outNULL'])
        for node, props in self._execution.items():
            msg.debug('Removing fake data from node {0}'.format(node))
            props['input'].difference_update(fakeData)
            props['output'].difference_update(fakeData)

        msg.debug('Execution dictionary: {0}'.format(self._execution))


    def _bestPath(self, data, dataAvailable, startNodeName = '_start', endNodeName = None):
        """Return the cheapest path from startNodeName to the end node for 'data'.

        The search runs backwards: it is seeded with a path holding only the
        end node (default '_end_<data>') and repeatedly extends each path
        along the incoming connections of its first node, cloning the path
        when there is more than one. Paths whose first node has no incoming
        connections are dropped, and of the paths that reached the start node
        only the one with the lowest cost is kept.

        The dataAvailable argument is accepted but not used by this method.

        Raises:
          trfExceptions.TransformGraphException -- when the end node is not in
          the graph, or when every candidate path has been dropped.
        """
        if endNodeName is None:
            endNodeName = '_end_{0}'.format(data)

        if endNodeName not in self._nodeDict:
            raise trfExceptions.TransformGraphException(trfExit.nameToCode('TRF_GRAPH_ERROR'),
                'Node {0} was not found - the transform data connection definition is broken'.format(endNodeName))

        # All paths still under consideration, seeded with the end node
        candidates = [graphPath(endNodeName, data)]

        msg.debug('Started path finding with seed path {0}'.format(candidates[0]))

        # Stop once a single path remains and it starts at startNodeName
        while len(candidates) > 1 or candidates[0].path[0] != startNodeName:
            msg.debug('Starting best path iteration with {0} paths in {1}'.format(len(candidates), candidates))

            # Iterate over a snapshot, as the candidate list is modified below
            for path in list(candidates):
                msg.debug('Continuing path finding with path {0}'.format(path))
                headName = path.path[0]
                if headName == startNodeName:
                    msg.debug('Path {0} has reached the start node - finished'.format(path))
                    continue

                feeders = list(self._nodeDict[headName].connections['in'])

                # Nothing feeds this node, so the path is a dead end
                if len(feeders) == 0:
                    msg.debug('Path {0} is a dead end - removing'.format(path))
                    candidates.remove(path)
                    continue

                # Exactly one feeder: extend the path in place
                if len(feeders) == 1:
                    msg.debug('Single exit from path {0} - adding connection to {1}'.format(path, feeders[0]))
                    self._extendPath(path, headName, feeders[0])
                    continue

                # Several feeders: clone the path for all but the first ...
                msg.debug('Multiple exits from path {0} - will clone for each extra exit'.format([path]))
                for nextNodeName in feeders[1:]:
                    newPath = copy.deepcopy(path)
                    msg.debug('Cloned exit from path {0} to {1}'.format(newPath, nextNodeName))
                    self._extendPath(newPath, headName, nextNodeName)
                    candidates.append(newPath)

                # ... and extend the original path along the first one
                msg.debug('Adding exit from original path {0} to {1}'.format(path, feeders[0]))
                self._extendPath(path, headName, feeders[0])

            # Among the paths which reached the start node keep only the cheapest
            lowestCostPath = None
            for path in list(candidates):
                if path.path[0] != startNodeName:
                    continue
                if lowestCostPath is None:
                    lowestCostPath = path
                elif path.cost >= lowestCostPath.cost:
                    msg.debug('Path {0} is no cheaper than best path {1} - removing'.format(path, lowestCostPath))
                    candidates.remove(path)
                else:
                    msg.debug('Path {0} is cheaper than previous best path {1} - removing previous'.format(path, lowestCostPath))
                    candidates.remove(lowestCostPath)
                    lowestCostPath = path

            # Emergency break
            if len(candidates) == 0:
                raise trfExceptions.TransformGraphException(trfExit.nameToCode('TRF_GRAPH_ERROR'),
                                                            'No path found between {0} and {1} for {2}'.format(startNodeName, endNodeName, data))

        return candidates[0]


    def _extendPath(self, path, currentNodeName, nextNodeName):
        """Prepend nextNodeName to 'path' together with the edge data and cost.

        The edge data are the data types recorded on the incoming connection
        of the current node from nextNodeName. If the current node is already
        enabled in the execution dictionary the step costs nothing. Otherwise
        the cost is the current node's weight for the matching input
        requirement; when that requirement is a list/tuple of several data
        types, its other members are recorded as extra data needed.

        The cost starts at 0, so the method no longer fails with
        UnboundLocalError if no edge data element matches any input
        requirement of the current node.
        """
        currentNode = self._nodeDict[currentNodeName]
        edgeData = currentNode.connections['in'][nextNodeName]
        msg.debug('Connecting {0} to {1} with data {2}'.format(currentNodeName, nextNodeName, edgeData))

        extraData = set()
        extraCost = 0
        if self._execution[currentNodeName]['enabled'] is not True:
            for edgeDataElement in edgeData:
                # Simple case - one data connection only
                if edgeDataElement in currentNode.inData:
                    extraCost = currentNode.weights[edgeDataElement]
                else:
                    # Complex case - the start requirement for this node must be multi-data
                    # Only the first match in the dataIn lists is considered
                    # This will break if there are multiple overlapping dataIn requirements
                    for nodeStartData in currentNode.inData:
                        if isinstance(nodeStartData, (list, tuple)) and edgeDataElement in nodeStartData:
                            extraCost = currentNode.weights[nodeStartData]
                            msg.debug('Found multi-data exit from {0} to {1} - adding {2} to data requirements'.format(currentNodeName, nextNodeName, nodeStartData))
                            extraData.update(nodeStartData)
                            break
            # Remove data which is on the edge itself
            extraData.difference_update(edgeData)

        msg.debug('Updating path {0} with {1}, {2}, {3}, {4}'.format(path, nextNodeName, edgeData, extraData, extraCost))
        path.addToPath(nextNodeName, edgeData, extraData, extraCost)


    def __str__(self):
        """Return one line per real node: str() of each node, pseudo-nodes skipped.

        Nodes are listed in topological order when that has been computed,
        otherwise in sorted name order. Names starting with '_' are omitted.
        """
        if len(self._toposort) > 0:
            orderedNames = self._toposort
        else:
            orderedNames = sorted(self._nodeDict)
        lines = [str(self._nodeDict[name]) for name in orderedNames if not name.startswith('_')]
        return os.linesep.join(lines)


    def __repr__(self):
        """Return one line per node: repr() of every node, pseudo-nodes included.

        Nodes are listed in topological order when that has been computed,
        otherwise in sorted name order.
        """
        if len(self._toposort) > 0:
            orderedNames = self._toposort
        else:
            orderedNames = sorted(self._nodeDict)
        return os.linesep.join([repr(self._nodeDict[name]) for name in orderedNames])


class graphNode(object):
    """A vertex of the executor graph: a name, its data in/out and its edges."""

    def __init__(self, name, inData, outData, weight = None):
        """Create a node.

        Parameters:
          name    -- node name.
          inData  -- iterable of input requirements; an element may be a single
                     data type or a list/tuple of data types.
          outData -- iterable of output data, same element convention.
          weight  -- None (every input requirement weighs 1), a number (that
                     weight for every input requirement; floats are accepted
                     as well as ints), or otherwise an object stored as-is
                     which must map each inData element to its weight.
        """
        self._name = name
        self._inData = set(inData)
        self._outData = set(outData)

        if weight is None:
            weight = 1
        if isinstance(weight, (int, float)):
            # One uniform weight for every input requirement
            self._inWeights = dict.fromkeys(self._inData, weight)
        else:
            # Must be a dictionary with its keys equal to the _inData elements
            self._inWeights = weight

        self._inputDataTypes = self._flattenSet(self._inData)
        self._outputDataTypes = self._flattenSet(self._outData)

        # Connections dictionary will hold incoming and outgoing edges - the incoming connections
        # are very useful for topological ordering. Nested dictionary with 'in', 'out' keys, where
        # the values are dictionaries with nodeName keys and set(dataTypes) as values.
        # e.g., {'out': {'_end_HIST': set(['HIST'])}, 'in': {'ESDtoAOD': set(['HIST_AOD']), 'RAWtoESD': set(['HIST_ESD'])}}
        self._connections = {'in': {}, 'out': {}}

    @property
    def name(self):
        """Node name."""
        return self._name

    @property
    def inData(self):
        """Set of input requirements as given (elements may be tuples)."""
        return self._inData

    @property
    def outData(self):
        """Set of output data as given (elements may be tuples)."""
        return self._outData

    @property
    def inputDataTypes(self):
        """Flattened set of input data types, recomputed on each access."""
        return self._flattenSet(self.inData)

    @property
    def outputDataTypes(self):
        """Flattened set of output data types, recomputed on each access."""
        return self._flattenSet(self._outData)

    @property
    def connections(self):
        """The {'in': {...}, 'out': {...}} edge dictionary of this node."""
        return self._connections

    @property
    def weights(self):
        """Mapping from input requirement to its weight."""
        return self._inWeights

    def addConnection(self, toExe, data, direction = 'out'):
        """Record (or overwrite) an edge to node 'toExe' carrying set(data)."""
        self._connections[direction][toExe] = set(data)

    def delConnection(self, toExe, direction = 'out'):
        """Delete the edge to node 'toExe'; KeyError if there is none."""
        del self._connections[direction][toExe]

    def resetConnections(self):
        """Forget all incoming and outgoing edges."""
        self._connections = {'in': {}, 'out': {}}

    def _flattenSet(self, startSet):
        """Return a set with list/tuple elements of startSet expanded one level."""
        flatData = set()
        for data in startSet:
            if isinstance(data, (list, tuple)):
                flatData.update(data)
            else:
                flatData.add(data)
        return flatData

    def __str__(self):
        return '{0} (dataIn {1} -> dataOut {2})'.format(self._name, self._inData, self._outData)

    def __repr__(self):
        return '{0} (dataIn {1}, weights {2}; dataOut {3}; connect {4})'.format(self._name, self._inData, self._inWeights, self._outData, self._connections)


class executorNode(graphNode):
    """Graph node whose name and data declarations come from an executor."""

    def __init__(self, executor = None, weight = None):
        """Initialise the node from executor.name, .inData and .outData.

        The executor default of None is not usable: its attributes are read
        unconditionally.
        """
        nodeName = executor.name
        consumed = executor.inData
        produced = executor.outData
        super(executorNode, self).__init__(nodeName, consumed, produced, weight)


class graphPath(object):
    """A path through the graph, built backwards from an end node.

    Alongside the ordered node names it keeps a running cost and, per added
    node, the data on the edge used ('newData') and any additional data
    required ('extraData').
    """

    def __init__(self, endNodeName, data, cost = 0):
        """Start a path containing only endNodeName, for data type 'data'."""
        self._path = [endNodeName]
        self._data = data
        self._cost = cost

        self._newData = dict()
        self._extraData = dict()

    @property
    def path(self):
        """List of node names, first node of the path first."""
        return self._path

    @property
    def cost(self):
        """Accumulated cost of the path."""
        return self._cost

    @property
    def newData(self):
        """Dictionary: node name -> edge data recorded when it was added."""
        return self._newData

    @property
    def extraData(self):
        """Dictionary: node name -> extra data recorded when it was added."""
        return self._extraData

    def addToPath(self, newNodeName, newData = None, extraData = None, extraCost = 0):
        """Put newNodeName at the front of the path and record its data and cost.

        When newData or extraData is omitted (or None) a fresh empty set is
        stored, so paths never share one default set object.
        """
        self._path.insert(0, newNodeName)
        self._newData[newNodeName] = set() if newData is None else newData
        self._cost += extraCost
        self._extraData[newNodeName] = set() if extraData is None else extraData

    def addCost(self, cost):
        """Increase the path cost by 'cost'."""
        self._cost += cost

    def __str__(self):
        return '{0}: path {1}; cost {2}, newData {3}, extraData {4}'.format(self._data, self._path, self._cost, self._newData, self._extraData)

    def __repr__(self):
        # Makes lists of paths readable when they are formatted into log messages
        return '<graphPath {0}>'.format(self)