From 0603a6987b933008cc851357740c9515229dc235 Mon Sep 17 00:00:00 2001 From: fkm3 Date: Thu, 25 May 2017 19:19:22 -0700 Subject: [PATCH] Add Minimize module with gradient descent and adam implementations (#125) --- README.md | 22 ++--- tensorflow-mnist/app/Main.hs | 10 +- tensorflow-ops/src/TensorFlow/Gradient.hs | 3 +- tensorflow-ops/src/TensorFlow/Minimize.hs | 115 ++++++++++++++++++++++ tensorflow-ops/src/TensorFlow/Variable.hs | 59 +++++++++++ tensorflow-ops/tensorflow-ops.cabal | 1 + tensorflow-ops/tests/MatrixTest.hs | 26 ++--- tensorflow-ops/tests/RegressionTest.hs | 22 ++--- 8 files changed, 203 insertions(+), 55 deletions(-) create mode 100644 tensorflow-ops/src/TensorFlow/Minimize.hs diff --git a/README.md b/README.md index 8ebb889..09e7121 100644 --- a/README.md +++ b/README.md @@ -20,14 +20,15 @@ Toy example of a linear regression model ([full code](tensorflow-ops/tests/RegressionTest.hs)): ```haskell -import Control.Monad (replicateM, replicateM_, zipWithM) +import Control.Monad (replicateM, replicateM_) import System.Random (randomIO) import Test.HUnit (assertBool) import qualified TensorFlow.Core as TF import qualified TensorFlow.GenOps.Core as TF -import qualified TensorFlow.Gradient as TF -import qualified TensorFlow.Ops as TF +import qualified TensorFlow.Minimize as TF +import qualified TensorFlow.Ops as TF hiding (initializedVariable) +import qualified TensorFlow.Variable as TF main :: IO () main = do @@ -48,23 +49,14 @@ fit xData yData = TF.runSession $ do w <- TF.initializedVariable 0 b <- TF.initializedVariable 0 -- Define the loss function. - let yHat = (x `TF.mul` w) `TF.add` b + let yHat = (x `TF.mul` TF.readValue w) `TF.add` TF.readValue b loss = TF.square (yHat `TF.sub` y) -- Optimize with gradient descent. - trainStep <- gradientDescent 0.001 loss [w, b] + trainStep <- TF.minimizeWith (TF.gradientDescent 0.001) loss [w, b] replicateM_ 1000 (TF.run trainStep) -- Return the learned parameters. 
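+    -- ('w' and 'b' are now resource 'TF.Variable's rather than 'Ref'
+    -- tensors, so their learned values are fetched via 'TF.readValue'.)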
- (TF.Scalar w', TF.Scalar b') <- TF.run (w, b) + (TF.Scalar w', TF.Scalar b') <- TF.run (TF.readValue w, TF.readValue b) return (w', b') - -gradientDescent :: Float - -> TF.Tensor TF.Build Float - -> [TF.Tensor TF.Ref Float] - -> TF.Session TF.ControlNode -gradientDescent alpha loss params = do - let applyGrad param grad = - TF.assign param (param `TF.sub` (TF.scalar alpha `TF.mul` grad)) - TF.group =<< zipWithM applyGrad params =<< TF.gradients loss params ``` # Installation Instructions diff --git a/tensorflow-mnist/app/Main.hs b/tensorflow-mnist/app/Main.hs index a2dc0d7..19b8b58 100644 --- a/tensorflow-mnist/app/Main.hs +++ b/tensorflow-mnist/app/Main.hs @@ -15,7 +15,7 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedLists #-} -import Control.Monad (zipWithM, when, forM_) +import Control.Monad (forM_, when) import Control.Monad.IO.Class (liftIO) import Data.Int (Int32, Int64) import Data.List (genericLength) @@ -23,9 +23,9 @@ import qualified Data.Text.IO as T import qualified Data.Vector as V import qualified TensorFlow.Core as TF -import qualified TensorFlow.Gradient as TF import qualified TensorFlow.Ops as TF hiding (initializedVariable, zeroInitializedVariable) import qualified TensorFlow.Variable as TF +import qualified TensorFlow.Minimize as TF import TensorFlow.Examples.MNIST.InputData import TensorFlow.Examples.MNIST.Parse @@ -87,11 +87,7 @@ createModel = do loss = reduceMean $ fst $ TF.softmaxCrossEntropyWithLogits logits labelVecs params = [hiddenWeights, hiddenBiases, logitWeights, logitBiases] - grads <- TF.gradients loss params - - let lr = TF.scalar 0.00001 - applyGrad param grad = TF.assignAdd param (negate $ lr `TF.mul` grad) - trainStep <- TF.group =<< zipWithM applyGrad params grads + trainStep <- TF.minimizeWith TF.adam loss params let correctPredictions = TF.equal predict labels errorRateTensor <- TF.render $ 1 - reduceMean (TF.cast correctPredictions) diff --git a/tensorflow-ops/src/TensorFlow/Gradient.hs b/tensorflow-ops/src/TensorFlow/Gradient.hs index 4056101..d07e9e9 100644 --- a/tensorflow-ops/src/TensorFlow/Gradient.hs +++ b/tensorflow-ops/src/TensorFlow/Gradient.hs @@ -22,7 +22,8 @@ {-# LANGUAGE ViewPatterns #-} module TensorFlow.Gradient - ( gradients + ( GradientCompatible + , gradients ) where import Control.Monad (forM, zipWithM) diff --git a/tensorflow-ops/src/TensorFlow/Minimize.hs b/tensorflow-ops/src/TensorFlow/Minimize.hs new file mode 100644 index 0000000..0062d54 --- /dev/null +++ b/tensorflow-ops/src/TensorFlow/Minimize.hs @@ -0,0 +1,115 @@ +-- Copyright 2016 TensorFlow authors. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} + +module TensorFlow.Minimize + ( Minimizer + , minimizeWith + , gradientDescent + , AdamConfig(..) 
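+    -- 'adam' is 'adam'' applied to the default 'AdamConfig'.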
+ , adam + , adam' + ) where + +import Control.Monad (zipWithM) +import Data.Default (Default(..)) +import Data.List (zipWith4) +import Data.Maybe (fromMaybe) + +import qualified TensorFlow.Core as TF +import qualified TensorFlow.Gradient as TF +import qualified TensorFlow.Ops as TF hiding (assign, initializedVariable) +import qualified TensorFlow.Variable as TF + +-- | Functions that minimize a loss w.r.t. a set of 'TF.Variable's. +-- +-- Generally only performs one step of an iterative algorithm. +-- +-- 'Minimizer's are defined as a function of the gradients instead of +-- the loss so that users can apply transformations to the gradients. +type Minimizer a = + forall m. TF.MonadBuild m => + [TF.Variable a] -> [TF.Tensor TF.Value a] -> m TF.ControlNode + +-- | Convenience wrapper around 'TF.gradients' and a 'Minimizer'. +minimizeWith :: (TF.MonadBuild m, TF.GradientCompatible a) + => Minimizer a + -> TF.Tensor v a -- ^ Loss. + -> [TF.Variable a] -- ^ Parameters of the loss function. + -> m TF.ControlNode +minimizeWith minimizer loss params = + TF.gradients loss params >>= minimizer params + +-- | Perform one step of the gradient descent algorithm. +gradientDescent :: TF.GradientCompatible a + => a -- ^ Learning rate. + -> Minimizer a +gradientDescent learningRate params grads = TF.withNameScope "gradientDescent" $ do + let applyGrad param grad = + TF.assignAdd param (TF.scalar (-learningRate) `TF.mul` grad) + TF.group =<< zipWithM applyGrad params grads + +-- TODO: Support more than Float in adam. + +data AdamConfig = AdamConfig + { adamLearningRate :: Float + , adamBeta1 :: Float + , adamBeta2 :: Float + , adamEpsilon :: Float + } + +instance Default AdamConfig where + -- Recommended defaults from the adam paper. + def = AdamConfig 0.001 0.9 0.999 1e-8 + +-- | Perform one step of the adam algorithm. +-- +-- See https://arxiv.org/abs/1412.6980. +-- +-- NOTE: Currently requires all 'TF.Variable's to have an 'TF.initializedValue'. +adam :: Minimizer Float +adam = adam' def + +adam' :: AdamConfig -> Minimizer Float +adam' config params grads = TF.withNameScope "adam" $ do + let lr = TF.scalar (adamLearningRate config) + beta1 = TF.scalar (adamBeta1 config) + beta2 = TF.scalar (adamBeta2 config) + epsilon = TF.scalar (adamEpsilon config) + -- Create adam state variables. + let errorMsg = "TensorFlow.Minimize.adam requires an initial value for all variables" + initVal = fromMaybe (error errorMsg) . TF.initializedValue + ms <- mapM (TF.initializedVariable . TF.zerosLike . initVal) params + vs <- mapM (TF.initializedVariable . TF.zerosLike . initVal) params + beta1Power <- TF.initializedVariable beta1 + beta2Power <- TF.initializedVariable beta2 + -- Perform adam update. + let applyGrad param m v = + TF.resourceApplyAdam param m v + (TF.readValue beta1Power) + (TF.readValue beta2Power) + lr beta1 beta2 epsilon + updateVars <- sequence $ zipWith4 applyGrad params ms vs grads + -- Update beta variables after adam update. 
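+    -- ('resourceApplyAdam' computes its bias-corrected step size from
+    -- beta1^t and beta2^t, so each power variable is multiplied by its
+    -- beta once per step, sequenced after the parameter updates by
+    -- 'TF.withControlDependencies'.)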
+ let updateBeta betaPower beta = + TF.withControlDependencies updateVars + (TF.assign betaPower (TF.readValue betaPower `TF.mul` beta)) + updateBeta1 <- updateBeta beta1Power beta1 + updateBeta2 <- updateBeta beta2Power beta2 + TF.group (updateBeta1:updateBeta2:updateVars) diff --git a/tensorflow-ops/src/TensorFlow/Variable.hs b/tensorflow-ops/src/TensorFlow/Variable.hs index 5c8f6c7..6c752db 100644 --- a/tensorflow-ops/src/TensorFlow/Variable.hs +++ b/tensorflow-ops/src/TensorFlow/Variable.hs @@ -6,6 +6,8 @@ -- TODO: given that distinction, figure out a good story around -- gradients and save/restore. Then, merge this module into -- TensorFlow.Ops. +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE RecursiveDo #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE OverloadedStrings #-} @@ -23,8 +25,13 @@ module TensorFlow.Variable , assign' , assignAdd , assignAdd' + , resourceApplyAdam + , resourceApplyAdam' ) where +import qualified Data.Complex +import qualified Data.Int +import qualified Data.Word import Data.Text.Encoding (encodeUtf8) import Lens.Family2 ((.~), (&)) import TensorFlow.Core @@ -133,3 +140,55 @@ assignAdd = assignAdd' id assignAdd' :: (MonadBuild m, TensorType a) => OpParams -> Variable a -> Tensor v a -> m ControlNode assignAdd' params (Variable h _) v = CoreOps.assignAddVariableOp' params h v + +-- | Update '*var' according to the Adam algorithm. +-- +-- lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) +-- m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t +-- v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t +-- variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) +resourceApplyAdam :: + (MonadBuild m, + OneOf '[(Data.Complex.Complex Double), + (Data.Complex.Complex Float), + Data.Int.Int16, + Data.Int.Int32, + Data.Int.Int64, Data.Int.Int8, + Data.Word.Word16, + Data.Word.Word8, Double, + Float] t) + => Variable t -- ^ __var__: Should be from a Variable(). + -> Variable t -- ^ __m__: Should be from a Variable(). + -> Variable t -- ^ __v__: Should be from a Variable(). + -> Tensor v1 t -- ^ __beta1_power__: Must be a scalar. + -> Tensor v2 t -- ^ __beta2_power__: Must be a scalar. + -> Tensor v3 t -- ^ __lr__: Scaling factor. Must be a scalar. + -> Tensor v4 t -- ^ __beta1__: Momentum factor. Must be a scalar. + -> Tensor v5 t -- ^ __beta2__: Momentum factor. Must be a scalar. + -> Tensor v6 t -- ^ __epsilon__: Ridge term. Must be a scalar. + -> Tensor v7 t -- ^ __grad__: The gradient. + -> m (ControlNode) +resourceApplyAdam = resourceApplyAdam' id + +resourceApplyAdam' :: + (MonadBuild m, + OneOf '[(Data.Complex.Complex Double), + (Data.Complex.Complex Float), + Data.Int.Int16, Data.Int.Int32, + Data.Int.Int64, Data.Int.Int8, + Data.Word.Word16, Data.Word.Word8, Double, + Float] t) + => OpParams + -> Variable t -- ^ __var__: Should be from a Variable(). + -> Variable t -- ^ __m__: Should be from a Variable(). + -> Variable t -- ^ __v__: Should be from a Variable(). + -> Tensor v1 t -- ^ __beta1_power__: Must be a scalar. + -> Tensor v2 t -- ^ __beta2_power__: Must be a scalar. + -> Tensor v3 t -- ^ __lr__: Scaling factor. Must be a scalar. + -> Tensor v4 t -- ^ __beta1__: Momentum factor. Must be a scalar. + -> Tensor v5 t -- ^ __beta2__: Momentum factor. Must be a scalar. + -> Tensor v6 t -- ^ __epsilon__: Ridge term. Must be a scalar. + -> Tensor v7 t -- ^ __grad__: The gradient. 
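+    -- (Unwraps each 'Variable' to its resource handle and delegates to
+    -- the generated 'CoreOps.resourceApplyAdam''.)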
+ -> m (ControlNode) +resourceApplyAdam' params (Variable var _) (Variable m _) (Variable v _) = + CoreOps.resourceApplyAdam' params var m v diff --git a/tensorflow-ops/tensorflow-ops.cabal b/tensorflow-ops/tensorflow-ops.cabal index 20dd9f3..08f5259 100644 --- a/tensorflow-ops/tensorflow-ops.cabal +++ b/tensorflow-ops/tensorflow-ops.cabal @@ -17,6 +17,7 @@ library exposed-modules: TensorFlow.Gradient , TensorFlow.Ops , TensorFlow.EmbeddingOps + , TensorFlow.Minimize , TensorFlow.NN , TensorFlow.Queue , TensorFlow.Variable diff --git a/tensorflow-ops/tests/MatrixTest.hs b/tensorflow-ops/tests/MatrixTest.hs index b990fbd..bace098 100644 --- a/tensorflow-ops/tests/MatrixTest.hs +++ b/tensorflow-ops/tests/MatrixTest.hs @@ -2,13 +2,14 @@ {-# LANGUAGE OverloadedLists #-} import Control.Monad.IO.Class (liftIO) -import Control.Monad (replicateM_, zipWithM) +import Control.Monad (replicateM_) -import qualified TensorFlow.GenOps.Core as TF (square, rank) -import qualified TensorFlow.Core as TF -import qualified TensorFlow.Gradient as TF -import qualified TensorFlow.Ops as TF import qualified Data.Vector as V +import qualified TensorFlow.Core as TF +import qualified TensorFlow.GenOps.Core as TF (square, rank) +import qualified TensorFlow.Minimize as TF +import qualified TensorFlow.Ops as TF hiding (initializedVariable) +import qualified TensorFlow.Variable as TF import Test.Framework (defaultMain, Test) import Test.Framework.Providers.HUnit (testCase) @@ -26,22 +27,13 @@ fitMatrix = testCase "fitMatrix" $ TF.runSession $ do v <- TF.initializedVariable =<< randomParam [1, 2] let ones = [1, 1, 1, 1] :: [Float] matx = TF.constant [2, 2] ones - diff = matx `TF.sub` (u `TF.matMul` v) + diff = matx `TF.sub` (TF.readValue u `TF.matMul` TF.readValue v) loss = reduceMean $ TF.square diff - trainStep <- gradientDescent 0.01 loss [u, v] + trainStep <- TF.minimizeWith (TF.gradientDescent 0.01) loss [u, v] replicateM_ 1000 (TF.run trainStep) - (u',v') <- TF.run (u, v) + (u',v') <- TF.run (TF.readValue u, TF.readValue v) -- ones = u * v liftIO $ assertAllClose (V.fromList ones) ((*) <$> u' <*> v') - -gradientDescent :: Float - -> TF.Tensor TF.Build Float - -> [TF.Tensor TF.Ref Float] - -> TF.Session TF.ControlNode -gradientDescent alpha loss params = do - let applyGrad param grad = - TF.assign param (param `TF.sub` (TF.scalar alpha `TF.mul` grad)) - TF.group =<< zipWithM applyGrad params =<< TF.gradients loss params main :: IO () main = defaultMain [ fitMatrix ] diff --git a/tensorflow-ops/tests/RegressionTest.hs b/tensorflow-ops/tests/RegressionTest.hs index 7fd2e73..0190b19 100644 --- a/tensorflow-ops/tests/RegressionTest.hs +++ b/tensorflow-ops/tests/RegressionTest.hs @@ -1,13 +1,14 @@ -- | Simple linear regression example for the README. -import Control.Monad (replicateM, replicateM_, zipWithM) +import Control.Monad (replicateM, replicateM_) import System.Random (randomIO) import Test.HUnit (assertBool) import qualified TensorFlow.Core as TF import qualified TensorFlow.GenOps.Core as TF -import qualified TensorFlow.Gradient as TF -import qualified TensorFlow.Ops as TF +import qualified TensorFlow.Minimize as TF +import qualified TensorFlow.Ops as TF hiding (initializedVariable) +import qualified TensorFlow.Variable as TF main :: IO () main = do @@ -28,20 +29,11 @@ fit xData yData = TF.runSession $ do w <- TF.initializedVariable 0 b <- TF.initializedVariable 0 -- Define the loss function. 
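+    -- The model is linear: yHat = w*x + b, reading the variables' current
+    -- values with 'TF.readValue'.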
- let yHat = (x `TF.mul` w) `TF.add` b + let yHat = (x `TF.mul` TF.readValue w) `TF.add` TF.readValue b loss = TF.square (yHat `TF.sub` y) -- Optimize with gradient descent. - trainStep <- gradientDescent 0.001 loss [w, b] + trainStep <- TF.minimizeWith (TF.gradientDescent 0.001) loss [w, b] replicateM_ 1000 (TF.run trainStep) -- Return the learned parameters. - (TF.Scalar w', TF.Scalar b') <- TF.run (w, b) + (TF.Scalar w', TF.Scalar b') <- TF.run (TF.readValue w, TF.readValue b) return (w', b') - -gradientDescent :: Float - -> TF.Tensor TF.Build Float - -> [TF.Tensor TF.Ref Float] - -> TF.Session TF.ControlNode -gradientDescent alpha loss params = do - let applyGrad param grad = - TF.assign param (param `TF.sub` (TF.scalar alpha `TF.mul` grad)) - TF.group =<< zipWithM applyGrad params =<< TF.gradients loss params
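Because a `Minimizer` is a function of the gradients rather than of the loss (the Haddock above notes this is "so that users can apply transformations to the gradients"), custom optimizers compose directly with the built-in ones. Below is a minimal sketch of such a transformation; `scaledDescent` is a hypothetical helper, not provided by this patch, that scales each gradient by a constant factor before delegating to `gradientDescent`. It assumes only the `TensorFlow.Minimize` API above plus `TF.render` from `TensorFlow.Core` (already used in `Main.hs` above):

```haskell
{-# LANGUAGE RankNTypes #-}

import qualified TensorFlow.Core as TF
import qualified TensorFlow.Minimize as TF
import qualified TensorFlow.Ops as TF

-- | Hypothetical helper: scale every gradient by a constant factor, then
-- apply plain gradient descent. Any per-gradient transformation fits the
-- same shape.
scaledDescent :: Float  -- ^ Factor applied to each gradient.
              -> Float  -- ^ Learning rate.
              -> TF.Minimizer Float
scaledDescent factor lr params grads = do
    -- Build the scaled gradients, then render them back to the value
    -- tensors that a 'TF.Minimizer' expects.
    scaled <- mapM (TF.render . (TF.scalar factor `TF.mul`)) grads
    TF.gradientDescent lr params scaled
```

Swapping `TF.gradientDescent 0.001` for `scaledDescent 0.5 0.001` in the regression example above would halve every gradient before each step. Adam's hyperparameters can likewise be tuned through record update on the `Data.Default` instance, e.g. `TF.adam' def { TF.adamLearningRate = 0.01 }` with `def` imported from `Data.Default`.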