Skip to content

Commit 92a9cc1

Browse files
committed
Added cloudwatch alerts
1 parent c3a0fbb commit 92a9cc1

File tree

3 files changed

+172
-1
lines changed

3 files changed

+172
-1
lines changed

examples/complete/main.tf

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
locals {
22
name = "redis"
3-
region = "us-west-2"
3+
region = "us-east-1"
44
family = "redis6.x"
55
node_type = "cache.t3.small"
66
vpc_cidr = "10.0.0.0/16"
@@ -114,6 +114,11 @@ module "redis" {
114114
cloudwatch_metric_alarms_enabled = true
115115
alarm_cpu_threshold_percent = 70
116116
alarm_memory_threshold_bytes = "10000000" # in bytes
117+
alarm_eviction_threshold = 1000
118+
alarm_connections_threshold = 100
119+
alarm_replication_lag_threshold = 10
120+
alarm_cache_hits_threshold = 1000
121+
alarm_cache_misses_threshold = 50
117122
slack_notification_enabled = false
118123
slack_username = ""
119124
slack_channel = ""

main.tf

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,141 @@ resource "aws_cloudwatch_metric_alarm" "cache_memory" {
261261
)
262262
}
263263

264+
# Alarm for Evictions
# Fires when the Sum of the "Evictions" metric over a single 300-second period
# is greater than var.alarm_eviction_threshold. Created only when
# var.cloudwatch_metric_alarms_enabled is true.
resource "aws_cloudwatch_metric_alarm" "cache_evictions" {
  count               = var.cloudwatch_metric_alarms_enabled ? 1 : 0
  alarm_name          = format("%s-%s-%s", var.environment, var.name, "evictions")
  alarm_description   = "Redis evictions due to memory pressure"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  metric_name         = "Evictions"
  namespace           = "AWS/ElastiCache"
  period              = 300
  statistic           = "Sum"
  threshold           = var.alarm_eviction_threshold

  dimensions = {
    # CacheClusterId has to name an individual cache cluster. A replication
    # group's own ID is not a cache cluster ID, so when a replication group is
    # in use the alarm is bound to the first member cluster (sorted by name).
    # NOTE(review): only that one member is watched - confirm this is enough,
    # or fan the alarm out over every entry of member_clusters.
    CacheClusterId = var.num_cache_nodes > 1 ? element(sort(aws_elasticache_replication_group.redis[0].member_clusters), 0) : aws_elasticache_cluster.redis[0].id
  }

  # The same SNS topic is notified on both the ALARM and the OK transition.
  # NOTE(review): this indexes slack_topic[0] unconditionally - verify the
  # topic exists whenever cloudwatch_metric_alarms_enabled is true.
  alarm_actions = [aws_sns_topic.slack_topic[0].arn]
  ok_actions    = [aws_sns_topic.slack_topic[0].arn]
  depends_on    = [aws_sns_topic.slack_topic]

  tags = merge(
    { "Name" = format("%s-%s-%s", var.environment, var.name, "eviction_metric") },
    local.tags,
  )
}
290+
291+
# Alarm for Connections
# Fires when the Average of the "CurrConnections" metric over a single
# 300-second period is greater than var.alarm_connections_threshold. Created
# only when var.cloudwatch_metric_alarms_enabled is true.
resource "aws_cloudwatch_metric_alarm" "cache_connections" {
  count               = var.cloudwatch_metric_alarms_enabled ? 1 : 0
  alarm_name          = format("%s-%s-%s", var.environment, var.name, "connections")
  alarm_description   = "Redis cluster number of client connections"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  metric_name         = "CurrConnections"
  namespace           = "AWS/ElastiCache"
  period              = 300
  statistic           = "Average"
  threshold           = var.alarm_connections_threshold

  dimensions = {
    # CacheClusterId has to name an individual cache cluster. A replication
    # group's own ID is not a cache cluster ID, so when a replication group is
    # in use the alarm is bound to the first member cluster (sorted by name).
    # NOTE(review): only that one member is watched - confirm this is enough,
    # or fan the alarm out over every entry of member_clusters.
    CacheClusterId = var.num_cache_nodes > 1 ? element(sort(aws_elasticache_replication_group.redis[0].member_clusters), 0) : aws_elasticache_cluster.redis[0].id
  }

  # The same SNS topic is notified on both the ALARM and the OK transition.
  # NOTE(review): this indexes slack_topic[0] unconditionally - verify the
  # topic exists whenever cloudwatch_metric_alarms_enabled is true.
  alarm_actions = [aws_sns_topic.slack_topic[0].arn]
  ok_actions    = [aws_sns_topic.slack_topic[0].arn]
  depends_on    = [aws_sns_topic.slack_topic]

  tags = merge(
    { "Name" = format("%s-%s-%s", var.environment, var.name, "connections_metric") },
    local.tags,
  )
}
317+
318+
# Alarm for Replication Lag (if using replication)
# Fires when the Maximum of the "ReplicationLag" metric over a single
# 300-second period is greater than var.alarm_replication_lag_threshold.
# Created only when alarms are enabled and var.num_cache_nodes > 1.
resource "aws_cloudwatch_metric_alarm" "cache_replication_lag" {
  count               = var.cloudwatch_metric_alarms_enabled && var.num_cache_nodes > 1 ? 1 : 0
  alarm_name          = format("%s-%s-%s", var.environment, var.name, "replication-lag")
  alarm_description   = "Redis replication lag"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  metric_name         = "ReplicationLag"
  namespace           = "AWS/ElastiCache"
  period              = 300
  statistic           = "Maximum"
  threshold           = var.alarm_replication_lag_threshold

  dimensions = {
    # ElastiCache node metrics are looked up by CacheClusterId, not by the
    # replication group ID, so the alarm is bound to a member cluster.
    # element() wraps around, so index 1 can never be out of range.
    # NOTE(review): the second member (sorted by name) is presumed to be a
    # replica; roles may change after a failover - confirm, or create one
    # alarm per entry of member_clusters.
    CacheClusterId = element(sort(aws_elasticache_replication_group.redis[0].member_clusters), 1)
  }

  # The same SNS topic is notified on both the ALARM and the OK transition.
  # NOTE(review): this indexes slack_topic[0] unconditionally - verify the
  # topic exists whenever this alarm is created.
  alarm_actions = [aws_sns_topic.slack_topic[0].arn]
  ok_actions    = [aws_sns_topic.slack_topic[0].arn]
  depends_on    = [aws_sns_topic.slack_topic]

  tags = merge(
    { "Name" = format("%s-%s-%s", var.environment, var.name, "replication_lag_metric") },
    local.tags,
  )
}
344+
345+
# Alarm for Cache Hits
# Fires when the Sum of the "CacheHits" metric over a single 300-second period
# is LESS than var.alarm_cache_hits_threshold (i.e. it flags an unusually low
# hit count). Created only when var.cloudwatch_metric_alarms_enabled is true.
resource "aws_cloudwatch_metric_alarm" "cache_hits" {
  count               = var.cloudwatch_metric_alarms_enabled ? 1 : 0
  alarm_name          = format("%s-%s-%s", var.environment, var.name, "cache-hits")
  alarm_description   = "Redis cache hits"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 1
  metric_name         = "CacheHits"
  namespace           = "AWS/ElastiCache"
  period              = 300
  statistic           = "Sum"
  threshold           = var.alarm_cache_hits_threshold

  dimensions = {
    # CacheClusterId has to name an individual cache cluster. A replication
    # group's own ID is not a cache cluster ID, so when a replication group is
    # in use the alarm is bound to the first member cluster (sorted by name).
    # NOTE(review): only that one member is watched - confirm this is enough,
    # or fan the alarm out over every entry of member_clusters.
    CacheClusterId = var.num_cache_nodes > 1 ? element(sort(aws_elasticache_replication_group.redis[0].member_clusters), 0) : aws_elasticache_cluster.redis[0].id
  }

  # The same SNS topic is notified on both the ALARM and the OK transition.
  # NOTE(review): this indexes slack_topic[0] unconditionally - verify the
  # topic exists whenever cloudwatch_metric_alarms_enabled is true.
  alarm_actions = [aws_sns_topic.slack_topic[0].arn]
  ok_actions    = [aws_sns_topic.slack_topic[0].arn]
  depends_on    = [aws_sns_topic.slack_topic]

  tags = merge(
    { "Name" = format("%s-%s-%s", var.environment, var.name, "cache_hits_metric") },
    local.tags,
  )
}
371+
372+
# Alarm for Cache Misses
# Fires when the Sum of the "CacheMisses" metric over a single 300-second
# period is greater than var.alarm_cache_misses_threshold. Created only when
# var.cloudwatch_metric_alarms_enabled is true.
resource "aws_cloudwatch_metric_alarm" "cache_misses" {
  count               = var.cloudwatch_metric_alarms_enabled ? 1 : 0
  alarm_name          = format("%s-%s-%s", var.environment, var.name, "cache-misses")
  alarm_description   = "Redis cache misses"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  metric_name         = "CacheMisses"
  namespace           = "AWS/ElastiCache"
  period              = 300
  statistic           = "Sum"
  threshold           = var.alarm_cache_misses_threshold

  dimensions = {
    # CacheClusterId has to name an individual cache cluster. A replication
    # group's own ID is not a cache cluster ID, so when a replication group is
    # in use the alarm is bound to the first member cluster (sorted by name).
    # NOTE(review): only that one member is watched - confirm this is enough,
    # or fan the alarm out over every entry of member_clusters.
    CacheClusterId = var.num_cache_nodes > 1 ? element(sort(aws_elasticache_replication_group.redis[0].member_clusters), 0) : aws_elasticache_cluster.redis[0].id
  }

  # The same SNS topic is notified on both the ALARM and the OK transition.
  # NOTE(review): this indexes slack_topic[0] unconditionally - verify the
  # topic exists whenever cloudwatch_metric_alarms_enabled is true.
  alarm_actions = [aws_sns_topic.slack_topic[0].arn]
  ok_actions    = [aws_sns_topic.slack_topic[0].arn]
  depends_on    = [aws_sns_topic.slack_topic]

  tags = merge(
    { "Name" = format("%s-%s-%s", var.environment, var.name, "cache_misses_metric") },
    local.tags,
  )
}
398+
264399
resource "aws_kms_key" "this" {
265400
count = var.slack_notification_enabled ? 1 : 0
266401
description = "KMS key for notify-slack test"

variables.tf

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,37 @@ variable "alarm_cpu_threshold_percent" {
202202
description = "CPU threshold alarm level"
203203
}
204204

205+
# Threshold for the "evictions" CloudWatch alarm: the alarm trips when the
# Sum of the Evictions metric over one 300-second period exceeds this value.
variable "alarm_eviction_threshold" {
  description = "Eviction threshold alarm level"
  default     = 20
  type        = number
}
210+
211+
# Threshold for the "connections" CloudWatch alarm: the alarm trips when the
# Average of the CurrConnections metric over one 300-second period exceeds
# this value.
variable "alarm_connections_threshold" {
  description = "Connections threshold alarm level"
  default     = 100
  type        = number
}
216+
217+
# Threshold for the "replication-lag" CloudWatch alarm: the alarm trips when
# the Maximum of the ReplicationLag metric over one 300-second period exceeds
# this value. The alarm itself is only created when var.num_cache_nodes > 1.
variable "alarm_replication_lag_threshold" {
  description = "Replication lag threshold alarm level"
  default     = 10
  type        = number
}
222+
223+
# Threshold for the "cache-hits" CloudWatch alarm. This is a LOWER bound: the
# alarm trips when the Sum of the CacheHits metric over one 300-second period
# falls below this value.
variable "alarm_cache_hits_threshold" {
  description = "Cache hits threshold alarm level"
  default     = 1000
  type        = number
}
228+
229+
# Threshold for the "cache-misses" CloudWatch alarm: the alarm trips when the
# Sum of the CacheMisses metric over one 300-second period exceeds this value.
variable "alarm_cache_misses_threshold" {
  description = "Cache misses threshold alarm level"
  default     = 50
  type        = number
}
234+
235+
205236
variable "alarm_actions" {
206237
type = list(string)
207238
description = "Alarm action list"

0 commit comments

Comments
 (0)