# Alarms when the environment health value is at or above 20 (Degraded) for 2 consecutive minutes
resource "aws_cloudwatch_metric_alarm" "app_beanstalk_environment_health" {
  alarm_name        = "${var.service_name}-${var.environment}-app_beanstalk_environment_health"
  alarm_description = "Monitors the health of ${var.service_name} Elastic Beanstalk API"

  # Metric under watch, scoped to a single Elastic Beanstalk environment.
  namespace   = "AWS/ElasticBeanstalk"
  metric_name = "EnvironmentHealth"

  dimensions = {
    EnvironmentName = var.elastic_beanstalk_environment_name
  }

  # Breach rule: the worst (maximum) value seen in each 60-second period must
  # reach the threshold for 2 periods in a row.
  statistic           = "Maximum"
  period              = 60
  evaluation_periods  = 2
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 20 # Degraded

  # Periods with no datapoints are treated as healthy.
  treat_missing_data = "notBreaching"

  # Both the ALARM and the OK transition publish to the same SNS topic.
  ok_actions    = [aws_sns_topic.pagerduty_sns_topic.arn]
  alarm_actions = [aws_sns_topic.pagerduty_sns_topic.arn]
}

# Alarms when the average p90 latency of the environment stays at or above 500ms for 20 minutes
resource "aws_cloudwatch_metric_alarm" "app_beanstalk_avg_latency" {
  alarm_name = "${var.service_name}-${var.environment}-avg_latency"

  alarm_description   = "Monitors the average p90 latency of ${var.service_name} Elastic Beanstalk API"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 20 # 20 consecutive periods
  metric_name         = "ApplicationLatencyP90"
  namespace           = "AWS/ElasticBeanstalk"
  period              = 60 # seconds per evaluation period
  statistic           = "Average"

  # Elastic Beanstalk publishes ApplicationLatencyP90 in SECONDS, not
  # milliseconds, so 500ms must be written as 0.5. A value of 500 would mean
  # 500 seconds and the alarm would effectively never fire.
  threshold = 0.5

  dimensions = {
    EnvironmentName = var.elastic_beanstalk_environment_name
  }

  # Both the ALARM and the OK transition publish to the same SNS topic.
  alarm_actions = [aws_sns_topic.pagerduty_sns_topic.arn]
  ok_actions    = [aws_sns_topic.pagerduty_sns_topic.arn]

  # Periods with no datapoints (e.g. no traffic) are treated as within threshold.
  treat_missing_data = "notBreaching"
}

# Alarms when the average CPU utilization of the autoscaling group crosses 80% for 20 minutes
resource "aws_cloudwatch_metric_alarm" "app_asg_cpu" {
  alarm_name        = "${var.service_name}-${var.environment}-asg-avg_cpu"
  alarm_description = "Monitors the average CPU utilization of ${var.service_name} Elastic Beanstalk Auto Scaling Group"

  # Metric under watch: EC2 CPU utilization aggregated over the Auto Scaling Group.
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"

  dimensions = {
    AutoScalingGroupName = var.elastic_beanstalk_asg_name
  }

  # Breach rule: the 60-second average must be at or above 80 for 20 periods in a row.
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 20
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 80

  # Periods with no datapoints are treated as within threshold.
  treat_missing_data = "notBreaching"

  # Both the ALARM and the OK transition publish to the same SNS topic.
  ok_actions    = [aws_sns_topic.pagerduty_sns_topic.arn]
  alarm_actions = [aws_sns_topic.pagerduty_sns_topic.arn]
}

# Alarms when the number of backend 5XX errors reaches 1000 per minute for 5 consecutive minutes
resource "aws_cloudwatch_metric_alarm" "app_elb_backend_5XX" {
  alarm_name        = "${var.service_name}-${var.environment}-backend_5XX"
  alarm_description = "Monitors the number of backend 5XX responses from ${var.service_name} Elastic Beanstalk API"

  # Metric under watch: 5XX responses produced by the instances behind the load balancer.
  namespace   = "AWS/ELB"
  metric_name = "HTTPCode_Backend_5XX"

  dimensions = {
    LoadBalancerName = var.elastic_beanstalk_loadbalancer_name
  }

  # Breach rule: the per-period sum (period = 60 seconds) must be at or above
  # the threshold for 5 periods in a row.
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 5
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1000

  # Periods with no datapoints are treated as within threshold.
  treat_missing_data = "notBreaching"

  # Both the ALARM and the OK transition publish to the same SNS topic.
  ok_actions    = [aws_sns_topic.pagerduty_sns_topic.arn]
  alarm_actions = [aws_sns_topic.pagerduty_sns_topic.arn]
}

# Alarms when the amount of spillover from the surge queue reaches 2500 for 2 minutes
resource "aws_cloudwatch_metric_alarm" "app_elb_spillover" {
  alarm_name        = "${var.service_name}-${var.environment}-spillover"
  alarm_description = "Monitors the number of requests spilled from the surge queue from ${var.service_name} Elastic Beanstalk load balancer"

  # Metric under watch: requests rejected by the load balancer's surge queue.
  namespace   = "AWS/ELB"
  metric_name = "SpilloverCount"

  dimensions = {
    LoadBalancerName = var.elastic_beanstalk_loadbalancer_name
  }

  # Breach rule: the per-period sum (period = 60 seconds) must be at or above
  # the threshold for 2 periods in a row.
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 2
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 2500

  # Periods with no datapoints are treated as within threshold.
  treat_missing_data = "notBreaching"

  # Both the ALARM and the OK transition publish to the same SNS topic.
  ok_actions    = [aws_sns_topic.pagerduty_sns_topic.arn]
  alarm_actions = [aws_sns_topic.pagerduty_sns_topic.arn]
}

# Alarms when the number of 5XX errors returned from the ELB reaches 1000 per minute for 5 consecutive minutes
resource "aws_cloudwatch_metric_alarm" "app_elb_5XX" {
  alarm_name        = "${var.service_name}-${var.environment}-elb_5xx"
  alarm_description = "Monitors the number of 5XX responses from ${var.service_name} Elastic Beanstalk load balancer"

  # Metric under watch: 5XX responses generated by the load balancer itself.
  namespace   = "AWS/ELB"
  metric_name = "HTTPCode_ELB_5XX"

  dimensions = {
    LoadBalancerName = var.elastic_beanstalk_loadbalancer_name
  }

  # Breach rule: the per-period sum (period = 60 seconds) must be at or above
  # the threshold for 5 periods in a row.
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 5
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1000

  # Periods with no datapoints are treated as within threshold.
  treat_missing_data = "notBreaching"

  # Both the ALARM and the OK transition publish to the same SNS topic.
  ok_actions    = [aws_sns_topic.pagerduty_sns_topic.arn]
  alarm_actions = [aws_sns_topic.pagerduty_sns_topic.arn]
}

# Alarms when the average latency from the ELB to instances stays at or above 500ms for 10 minutes
resource "aws_cloudwatch_metric_alarm" "app_elb_latency" {
  alarm_name = "${var.service_name}-${var.environment}-elb_latency"

  alarm_description   = "Monitors the latency from ${var.service_name} Elastic Beanstalk load balancer"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 10 # 10 consecutive periods
  metric_name         = "Latency"
  namespace           = "AWS/ELB"
  period              = 60 # seconds per evaluation period
  statistic           = "Average"

  # The Classic Load Balancer Latency metric is reported in SECONDS, not
  # milliseconds, so 500ms must be written as 0.5. A value of 500 would mean
  # 500 seconds and the alarm would effectively never fire.
  threshold = 0.5

  dimensions = {
    LoadBalancerName = var.elastic_beanstalk_loadbalancer_name
  }

  # Both the ALARM and the OK transition publish to the same SNS topic.
  alarm_actions = [aws_sns_topic.pagerduty_sns_topic.arn]
  ok_actions    = [aws_sns_topic.pagerduty_sns_topic.arn]

  # Periods with no datapoints (e.g. no traffic) are treated as within threshold.
  treat_missing_data = "notBreaching"
}

# Alarms when the number of cache evictions reaches 100 per minute for 2 consecutive minutes
resource "aws_cloudwatch_metric_alarm" "cache_evictions" {
  alarm_name = "${var.service_name}-${var.environment}-cache_evictions"

  # Description added so notifications for this alarm carry the same context
  # as every other alarm defined alongside it.
  alarm_description   = "Monitors the number of cache evictions from ${var.service_name} ElastiCache cluster"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 2
  metric_name         = "Evictions"
  namespace           = "AWS/ElastiCache"
  period              = 60 # seconds per evaluation period
  statistic           = "Sum"
  threshold           = 100

  dimensions = {
    CacheClusterId = var.elasticache_cluster_id
  }

  # Both the ALARM and the OK transition publish to the same SNS topic.
  alarm_actions = [aws_sns_topic.pagerduty_sns_topic.arn]
  ok_actions    = [aws_sns_topic.pagerduty_sns_topic.arn]

  # Periods with no datapoints are treated as within threshold.
  treat_missing_data = "notBreaching"
}
