Avoid computing gradients for incidental nodes (#238)

2025-03-29 17:15:21 +01:00 · 2019-04-11 20:17:19 +02:00 · 2019-04-11 20:17:19 +02:00 · c0f87dc0bc
commit c0f87dc0bc
parent 96f1c88327
2 changed files with 89 additions and 13 deletions
--- a/tensorflow-ops/src/TensorFlow/Gradient.hs
+++ b/tensorflow-ops/src/TensorFlow/Gradient.hs
@ -36,6 +36,7 @@ import Data.Int (Int32, Int64)
 import Data.Foldable (foldlM)
 import Data.List (foldl', sortBy)
 import Data.Map.Strict (Map)
 import qualified Data.IntSet as IntSet
 import Data.Maybe (fromMaybe, maybeToList, mapMaybe)
 import Data.Ord (comparing)
 import Data.ProtoLens.TextFormat (showMessage)
@ -165,6 +166,11 @@ gradients y xs = build $ do
        (\f x -> fromMaybe (error $ "no NodeDef found for " ++ show x) (f x))
        . flip Map.lookup
    let (gr, nodeMap) = createGraph yName nodeDefLookup
        xnodes = mapMaybe (\x -> nodeMap ^. (at . outputNodeName . renderedOutput $ x)) xs
        -- make a set of the nodes reachable from the xnodes
        -- The xnodes are not part of this set (unless reachable from another xnode)
        reachableSet = computeReachableSet xnodes gr
    -- Set gradient of y to one.
    -- TODO: nicer
    let initPending :: Map.Map FGL.Node (PendingGradients a)
@ -175,7 +181,7 @@ gradients y xs = build $ do
                                .~ [yOne]
                                )
    -- Calculate the gradients of y w.r.t. each node in the graph.
-    gradientMap <- graphGrads gr initPending
+    gradientMap <- graphGrads gr reachableSet initPending
    -- Lookup the gradients for each x.
    forM xs $ \x ->
        let Output i xName = renderedOutput x
@ -183,6 +189,13 @@ gradients y xs = build $ do
            n <- nodeMap ^. at xName
            gradientMap ^. at n . nonEmpty . outputIxAt i
 -- | Compute a set of nodes reachable from the start nodes
 --
 -- the start nodes are excluded, unless reachable from another start node
 computeReachableSet :: [FGL.Node] -> Graph -> IntSet.IntSet
 computeReachableSet vs g =
  IntSet.fromList $ concatMap (drop 1 . FGL.preorder) (FGL.dff vs g)
 outputIxAt :: OutputIx -> Lens' (IntMap.IntMap v) (Maybe v)
 outputIxAt = intAt . unOutputIx
@ -245,16 +258,15 @@ nonEmpty = anon mempty null
 -- | Calculate the gradients for every node in a graph.
 graphGrads :: forall a. GradientCompatible a
           => Graph
           -> IntSet.IntSet
           -> Map FGL.Node (PendingGradients a)
           -- ^ Initial gradients (usually just 1 for the node of interest).
           -> Build (Map FGL.Node (Gradients a))
-graphGrads gr initPending = view gradientsResult <$> foldlM go initState nodeOrder
+graphGrads gr reachableSet initPending = view gradientsResult <$> foldlM go initState nodeOrder
  where
    initState = GradientsState initPending Map.empty
    -- Reverse topological sort.
-    -- TODO(fmayle): Filter out nodes that are not successors of any x in xs to
+    nodeOrder = FGL.topsort . FGL.grev $ gr
    -- avoid calculating gradients that won't be used.
    nodeOrder = FGL.topsort $ FGL.grev gr
    go :: GradientsState a -> Int -> Build (GradientsState a)
    go state node = do
        -- Aggregate the accumulated gradients for this node.
@ -263,11 +275,17 @@ graphGrads gr initPending = view gradientsResult <$> foldlM go initState nodeOrd
        if null outputGrads
           then pure state
           else do
              let nextState = state & gradientsResult %~ Map.insert node outputGrads
              -- Only consider nodes that are reachable from the inputs to
              -- avoid calculating gradients that won't be used.
              if node `IntSet.member` reachableSet
                then do
                  let ctx = FGL.context gr node
                  inputGrads <- calculateInputGrads ctx outputGrads gr
                  -- Calculate the gradients for each of the node's inputs.
              let nextState = state & gradientsResult %~ Map.insert node outputGrads
                  pure $ updatePendingGradients ctx inputGrads nextState
                else
                  pure nextState
 -- | Reduce accumulated gradients for each output to one Tensor.
 sumPendingGradient :: GradientCompatible a
@ -839,11 +857,8 @@ opGrad "Fill" _ _ [dz] = [Nothing, Just $ sum dz rx]
 -- through each read.
 opGrad "ReadVariableOp" _ _ [dz] = [Just $ expr dz]
 -- TODO(fmayle): These can go away if we properly prune the graph.
 opGrad "Const" _ _ _ = [Nothing, Nothing]
 opGrad "Placeholder" _ _ _ = []
 opGrad "VarHandleOp" _ _ _ = []
 opGrad "Variable" _ _ _ = []
 opGrad "Sqrt" _ [toT -> x] [dz] = [Just $ sq' `CoreOps.mul` dz]
  where
--- a/tensorflow-ops/tests/GradientTest.hs
+++ b/tensorflow-ops/tests/GradientTest.hs
@ -32,7 +32,7 @@ import Control.Monad(forM_, replicateM, zipWithM)
 import Control.Monad.IO.Class (liftIO)
 import qualified TensorFlow.Core as TF
-import qualified TensorFlow.GenOps.Core as TF (conv2DBackpropInput', max, maximum, resizeBilinear', tile, pad, batchToSpaceND, spaceToBatchND, squeeze, sqrt, slice, shape)
+import qualified TensorFlow.GenOps.Core as TF (conv2DBackpropInput', max, maximum, resizeBilinear', tile, pad, batchToSpaceND, spaceToBatchND, squeeze, sqrt, slice, shape, diag)
 import qualified TensorFlow.Gradient as TF
 import qualified TensorFlow.Ops as TF hiding (zeroInitializedVariable, shape)
 import qualified TensorFlow.Output as TF
@ -123,6 +123,65 @@ testGradientDisconnected = testCase "testGradientDisconnected" $ do
                   ]
    sort expected @=? sort ops
 testGradientIncidental :: Test
 testGradientIncidental = testCase "testGradientIncidental" $ do
    let grads = do
            x <- TF.render $ TF.scalar (3 :: Float)
            b <- TF.render $ TF.scalar (4 :: Float)
            w <- TF.render $ TF.diag $ TF.vector [ 1.0 :: Float ]
            let incidental = b `TF.mul` w
            let y = (x `TF.mul` b) `TF.add` incidental
            TF.gradients y [x]
    -- Assert that the gradients are right.
    [dx] <- TF.runSession $ grads >>= TF.run
    4 @=? TF.unScalar dx
    -- Assert that the graph has the expected ops.
    let graphDef = TF.asGraphDef grads
    putStrLn $ showMessage graphDef
    let ops = graphDef ^.. node . traverse . op
        expected = [ "Add"
                   , "BroadcastGradientArgs"
                   , "BroadcastGradientArgs"
                   , "Const"
                   , "Const"
                   , "Const"
                   , "Const"
                   , "Diag"
                   , "Fill"
                   , "Mul"
                   , "Mul"
                   , "Mul"
                   , "Mul"
                   , "Reshape"
                   , "Reshape"
                   , "Reshape"
                   , "Reshape"
                   , "Shape"
                   , "Shape"
                   , "Shape"
                   , "Shape"
                   , "Shape"
                   , "Sum"
                   , "Sum"
                   , "Sum"
                   , "Sum"
                   ]
    sort expected @=? sort ops
 testGradientPruning :: Test
 testGradientPruning = testCase "testGradientPruning" $ do
    let grads = do
            x <- TF.render $ TF.scalar (3 :: Float)
            b <- TF.render $ TF.scalar (4 :: Float)
            bx <- TF.render $ b `TF.mul` x
            let y = bx `TF.add` b
            TF.gradients y [x, bx]
    -- Assert that the gradients are right.
    [dx, dxb] <- TF.runSession $ grads >>= TF.run
    4 @=? TF.unScalar dx
    1 @=? TF.unScalar dxb
 -- Test that identical "stateful" ops work with createGraph.
 testCreateGraphStateful :: Test
@ -545,6 +604,8 @@ main :: IO ()
 main = defaultMain
            [ testGradientSimple
            , testGradientDisconnected
            , testGradientIncidental
            , testGradientPruning
            , testCreateGraphStateful
            , testCreateGraphNameScopes
            , testDiamond