 
 
 def _assert_multinomial_distribution(input_tensor, axis):
-  """Assert input has valid multinomial distribution along specified axis."""
+  """Assert input has valid multinomial distribution along `axis`."""
   sum_of_multinomial_distribution = tf.reduce_sum(
       input_tensor=input_tensor, axis=axis)
   return [
@@ -36,7 +36,7 @@ def _assert_multinomial_distribution(input_tensor, axis):
 
 
 def _assert_valid_axis(ndims, axis):
-  """Assert the condition `-ndims < axis <= ndims` if axis is not None."""
39+ """Assert the condition `-ndims < axis <= ndims` if ` axis` is not ` None` ."""
   if axis and (axis < -ndims or axis >= ndims):
     raise ValueError('axis = %d not in [%d, %d)' % (axis, -ndims, ndims))
 
@@ -58,39 +58,41 @@ def kl_divergence(
5858 """Adds a KL-divergence to the training procedure.
5959
6060 For brevity, let `P = labels` and `Q = predictions`. The
61- Kullback-Leibler divergence`KL(P||Q)` is
61+ Kullback-Leibler divergence `KL(P||Q)` is:
6262
63- losses = P * log(P) - P * log(Q)
63+ ```
64+ KL(P||Q) = P * log(P) - P * log(Q)
65+ ```
 
-  Note, the function assumes that `predictions` and `labels` are the values of
-  multinomial distribution, i.e., each value is the probability of the
+  Note: the function assumes that `predictions` and `labels` are the values of
+  a multinomial distribution, i.e., each value is the probability of the
   corresponding class.
 
-  For the usage of `weights` and `reduction`, please refer to tf.losses.
+  For the usage of `weights` and `reduction`, please refer to `tf.losses`.
 
   Args:
-    labels: `Tensor` of type float32 or float64, with shape `[d1, ..., dN,
-      num_classes]`, represents target distribution.
+    labels: `Tensor` of type `float32` or `float64`, with shape `[d1, ..., dN,
+      num_classes]`, represents the target distribution.
     predictions: `Tensor` of the same type and shape as `labels`, represents
-      predicted distribution.
-    axis: The dimension along which the KL divergence is computed. Note, the
-      values of `labels` and `predictions` along the `axis` should meet the
-      condition of multinomial distribution.
-    weights: (optional) `Tensor` whose rank is either 0, or the same rank as
+      the predicted distribution.
+    axis: The dimension along which the KL divergence is computed. The values
+      of `labels` and `predictions` along `axis` should meet the requirements
+      of a multinomial distribution.
+    weights: (optional) `Tensor` whose rank is either 0, or the same as that of
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
-    loss_collection: collection to which the loss will be added.
-    reduction: Type of reduction to apply to loss.
+    loss_collection: Collection to which the loss will be added.
+    reduction: Type of reduction to apply to the loss.
 
   Returns:
-    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
-    shape as `labels`; otherwise, it is scalar.
+    Weighted loss `float` `Tensor`. If `reduction` is `NONE`, this has the same
+    shape as `labels`; otherwise, it is a scalar.
   Raises:
-    InvalidArgumentError: If `labels` or `predictions` doesn't meet the
-      condition of multinomial distribution.
-    ValueError: If `axis` is None, or the shape of `predictions` doesn't match
-      that of `labels` or if the shape of `weights` is invalid.
+    InvalidArgumentError: If `labels` or `predictions` don't meet the
+      requirements of a multinomial distribution.
+    ValueError: If `axis` is `None`, if the shape of `predictions` doesn't
+      match that of `labels`, or if the shape of `weights` is invalid.
   """
   with tf.compat.v1.name_scope(scope, 'kl_divergence',
                                (predictions, labels, weights)) as scope:
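As a quick reference for the formula in the updated docstring (this is an illustrative sketch, not part of the diff; the tensor values are made up), the elementwise KL terms can be computed in plain TensorFlow as follows:

```python
import tensorflow as tf

# Hypothetical distributions with shape [batch, num_classes]; each row sums to 1.
labels = tf.constant([[0.2, 0.8], [0.5, 0.5]])          # P
predictions = tf.constant([[0.25, 0.75], [0.9, 0.1]])   # Q

# Elementwise terms from the docstring: P * log(P) - P * log(Q).
kl_terms = labels * tf.math.log(labels) - labels * tf.math.log(predictions)

# Summing over the class axis gives KL(P||Q) for each example in the batch.
kl_per_example = tf.reduce_sum(kl_terms, axis=-1)
```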
@@ -121,40 +123,44 @@ def jensen_shannon_divergence(
121123 """Adds a Jensen-Shannon divergence to the training procedure.
122124
123125 For brevity, let `P = labels`, `Q = predictions`, `KL(P||Q)` be the
-  Kullback-Leibler divergence. The Jensen-Shannon divergence (JSD) is
+  Kullback-Leibler divergence as defined in the description of the
+  `nsl.lib.kl_divergence` function. The Jensen-Shannon divergence (JSD) is:
 
-    M = (P + Q) / 2
-    JSD(P||Q) = KL(P||M) / 2 + KL(Q||M) / 2
+  ```
+  M = (P + Q) / 2
+  JSD(P||Q) = KL(P||M) / 2 + KL(Q||M) / 2
+  ```
 
-  Note, the function assumes that `predictions` and `labels` are the values of
+  This function assumes that `predictions` and `labels` are the values of a
   multinomial distribution, i.e., each value is the probability of the
   corresponding class.
 
-  For the usage of `weights` and `reduction`, please refer to tf.losses.
+  For the usage of `weights` and `reduction`, please refer to `tf.losses`.
 
   Args:
-    labels: `Tensor` of type float32 or float64, with shape `[d1, ..., dN,
-      num_classes]`, represents target distribution.
+    labels: `Tensor` of type `float32` or `float64`, with shape `[d1, ..., dN,
+      num_classes]`, represents the target distribution.
     predictions: `Tensor` of the same type and shape as `labels`, represents
-      predicted distribution.
+      the predicted distribution.
     axis: The dimension along which the Jensen-Shannon divergence is computed.
-      Note, the values of `labels` and `predictions` along the `axis` should
-      meet the condition of multinomial distribution.
-    weights: (optional) `Tensor` whose rank is either 0, or the same rank as
+      The values of `labels` and `predictions` along `axis` should meet the
+      requirements of a multinomial distribution.
+    weights: (optional) `Tensor` whose rank is either 0, or the same as that of
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
-    loss_collection: collection to which the loss will be added.
-    reduction: Type of reduction to apply to loss.
+    loss_collection: Collection to which the loss will be added.
+    reduction: Type of reduction to apply to the loss.
 
   Returns:
-    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
-    shape as `labels`; otherwise, it is scalar.
+    Weighted loss `float` `Tensor`. If `reduction` is
+    `tf.compat.v1.losses.Reduction.NONE`, this has the same shape as `labels`;
+    otherwise, it is a scalar.
   Raises:
-    InvalidArgumentError: If `labels` or `predictions` doesn't meet the
-      condition of multinomial distribution.
-    ValueError: If `axis` is None, or the shape of `predictions` doesn't match
-      that of `labels` or if the shape of `weights` is invalid.
+    InvalidArgumentError: If `labels` or `predictions` don't meet the
+      requirements of a multinomial distribution.
+    ValueError: If `axis` is `None`, if the shape of `predictions` doesn't
+      match that of `labels`, or if the shape of `weights` is invalid.
158164 """
159165 with tf .compat .v1 .name_scope (scope , 'jensen_shannon_divergence' ,
160166 (predictions , labels , weights )) as scope :
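For context (not part of this change), here is a minimal sketch of the JSD formula above in plain TensorFlow, reusing an elementwise KL helper; the tensors are made-up examples:

```python
import tensorflow as tf

def kl(p, q, axis=-1):
  # Elementwise P * log(P) - P * log(Q), summed over the class axis.
  return tf.reduce_sum(p * tf.math.log(p) - p * tf.math.log(q), axis=axis)

p = tf.constant([[0.2, 0.8], [0.5, 0.5]])
q = tf.constant([[0.25, 0.75], [0.9, 0.1]])

m = (p + q) / 2.0
jsd = kl(p, m) / 2.0 + kl(q, m) / 2.0  # JSD(P||Q) for each example
```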
@@ -177,7 +183,7 @@ def jensen_shannon_divergence(
 
 
 def _apply_transform(batched_tensor, transform_type, axis=None):
-  """Applys the given transform function to the batched_tensor along axis."""
+  """Applies the given transform function to `batched_tensor` along `axis`."""
   if transform_type == configs.TransformType.SOFTMAX:
     return tf.nn.softmax(batched_tensor, axis=axis)
   else:
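A tiny illustration (not part of the diff) of what the SOFTMAX branch shown above does, using made-up logits:

```python
import tensorflow as tf

# Made-up logits with shape [batch, num_classes].
logits = tf.constant([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]])

# Equivalent to the configs.TransformType.SOFTMAX branch above: each row of
# the result is a probability distribution along the last axis.
probs = tf.nn.softmax(logits, axis=-1)
```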
@@ -217,60 +223,60 @@ def pairwise_distance_wrapper(sources,
                               targets,
                               weights=1.0,
                               distance_config=None):
-  """A wrapper to compute pairwise distance between sources and targets.
+  """A wrapper to compute the pairwise distance between `sources` and `targets`.
 
-  distances = weights * distance_type(sources, targets)
+  `distances = weights * distance_config.distance_type(sources, targets)`
 
   This wrapper calculates the weighted distance between `(sources, targets)`
   pairs, and provides an option to return the distance as the sum over the
   difference along the given axis, when vector based distance is needed.
 
-  For the usage of `weights` and `reduction`, please refer to tf.losses. For the
-  usage of `sum_over_axis`, see the following examples:
+  For the usage of `weights` and `reduction`, please refer to `tf.losses`. For
+  the usage of `sum_over_axis`, see the following examples:
 
-  Given target tensors with shape `[batch_size, features]`, reduction set to
-  be MEAN, and `sum_over_axis` set to be last dimension, the weighted average
-  distance of `sample pairs` will be returned. For example:
-  With a distance_config('L2', sum_over_axis=-1), the distance between
-  [[1, 1], [2, 2], [0, 2], [5, 5]] and [[1, 1], [0, 2], [4, 4], [1, 4]] will be
-  {(0+0) + (4+0) + (16+4) + (16+1)}/4 = 10.25
+  Given target tensors with shape `[batch_size, features]`, the reduction set to
+  `tf.compat.v1.losses.Reduction.MEAN`, and `sum_over_axis` set to the last
+  dimension, the weighted average distance of sample pairs will be returned.
+  For example: With a distance_config('L2', sum_over_axis=-1), the distance
+  between [[1, 1], [2, 2], [0, 2], [5, 5]] and [[1, 1], [0, 2], [4, 4], [1, 4]]
+  will be {(0+0) + (4+0) + (16+4) + (16+1)}/4 = 10.25
 
-  If `sum_over_axis` is None, the weighted average distance of `feature pairs`
-  (instead of sample pairs) will be returned. For example:
-  With a distance_config('L2'), the distance between
+  If `sum_over_axis` is `None`, the weighted average distance of feature pairs
+  (instead of sample pairs) will be returned. For example: With a
+  distance_config('L2'), the distance between
   [[1, 1], [2, 2], [0, 2], [5, 5]] and [[1, 1], [0, 2], [4, 4], [1, 4]] will be
   {(0+0) + (4+0) + (16+4) + (16+1)}/8 = 5.125
 
-  If `transform_fn` is not None, the transform function is applied to both
-  sources and targets before computing the distance. For example:
-  distance_config('KL_DIVERGENCE', sum_over_axis=-1, transform_fn='SOFTMAX')
+  If `transform_fn` is not `None`, the transform function is applied to both
+  `sources` and `targets` before computing the distance. For example:
+  `distance_config('KL_DIVERGENCE', sum_over_axis=-1, transform_fn='SOFTMAX')`
   treats `sources` and `targets` as logits, and computes the KL-divergence
-  between the probability distributions.
+  between the two probability distributions.
 
   Args:
-    sources: `Tensor` of type float32 or float64.
-    targets: `Tensor` of the same type and shape as sources.
-    weights: (optional) `Tensor` whose rank is either 0, or the same rank as
+    sources: `Tensor` of type `float32` or `float64`.
+    targets: `Tensor` of the same type and shape as `sources`.
+    weights: (optional) `Tensor` whose rank is either 0, or the same as that of
       `targets`, and must be broadcastable to `targets` (i.e., all dimensions
-      must be either `1`, or the same as the corresponding `distance`
-      dimension).
-    distance_config: DistanceConfig contains the following configs (or
-      hyper-parameters) for computing distances:
-      (a) 'distance_type': Type of distance function to apply.
-      (b) 'reduction': Type of distance reduction. Refer to tf.losses.Reduction.
-      (c) 'sum_over_axis': (optional) The distance is sum over the difference
-        along the axis. Note, if `sum_over_axis` is not None and the rank of
-        `weights` is nonzero, the size of `weights` along the `sum_over_axis`
-        must be 1.
-      (d) 'transform_fn': (optional) If set, both sources and targets will be
-        transformed before calculating the distance. If set to 'SOFTMAX', it
-        will be performed on the axis specified by 'sum_over_axis', or -1 if
-        that is not specified. If None, the default distance config will be
+      must be either `1`, or the same as the corresponding distance dimension).
+    distance_config: An instance of `nsl.configs.DistanceConfig` that contains
+      the following configuration (or hyperparameters) for computing distances:
+      (a) `distance_type`: Type of distance function to apply.
+      (b) `reduction`: Type of distance reduction. See `tf.losses.Reduction`.
+      (c) `sum_over_axis`: (optional) The distance is the sum over the
+        difference along the specified axis. Note that if `sum_over_axis` is not
+        `None` and the rank of `weights` is non-zero, then the size of `weights`
+        along `sum_over_axis` must be 1.
+      (d) `transform_fn`: (optional) If set, both `sources` and `targets` will
+        be transformed before calculating the distance. If set to 'SOFTMAX', it
+        will be performed on the axis specified by 'sum_over_axis', or -1 if the
+        axis is not specified. If `None`, the default distance config will be
         used.
 
   Returns:
-    Weighted distance scalar `Tensor`. If `reduction` is `NONE`, this has the
-    same shape as `targets`.
+    Weighted distance scalar `Tensor`. If `reduction` is
+    `tf.compat.v1.losses.Reduction.NONE`, this has the same shape as
+    `targets`.
   Raises:
     ValueError: If the shape of targets doesn't match that of sources, or if the
       shape of weights is invalid.
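To make the worked numbers in the docstring example concrete (illustrative only, not part of this diff; it reproduces the arithmetic with plain TensorFlow rather than calling the wrapper):

```python
import tensorflow as tf

sources = tf.constant([[1., 1.], [2., 2.], [0., 2.], [5., 5.]])
targets = tf.constant([[1., 1.], [0., 2.], [4., 4.], [1., 4.]])

# Elementwise squared differences, i.e. the 'L2' distance used in the example.
sq_diff = tf.math.squared_difference(sources, targets)

# sum_over_axis=-1 with MEAN reduction: average over the 4 sample pairs.
per_sample = tf.reduce_sum(sq_diff, axis=-1)   # [0., 4., 20., 17.]
print(tf.reduce_mean(per_sample).numpy())      # 10.25

# Without sum_over_axis: average over all 8 feature pairs.
print(tf.reduce_mean(sq_diff).numpy())         # 5.125
```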