import numpy as np
import matplotlib.pyplot as plt
from utils import *

%matplotlib inline

# --- First dataset: load it, inspect it, and plot the training points ---
# NOTE(review): load_data is not defined in this file; presumably it is
# provided by the `from utils import *` star import -- confirm.
# Load the dataset
X_train, X_val, y_val = load_data()

# Display the first five elements of X_train
print("The first 5 elements of X_train are:\n", X_train[:5])

# Display the first five elements of X_val
print("The first 5 elements of X_val are\n", X_val[:5])

# Display the first five elements of y_val
print("The first 5 elements of y_val are\n", y_val[:5])

# Report the shape of each of the three arrays
print ('The shape of X_train is:', X_train.shape)
print ('The shape of X_val is:', X_val.shape)
print ('The shape of y_val is: ', y_val.shape)

# Create a scatter plot of the data: column 0 of X_train on the x-axis,
# column 1 on the y-axis. To change the markers to blue "x",
# we used the 'marker' and 'c' parameters
plt.scatter(X_train[:, 0], X_train[:, 1], marker='x', c='b') 

# Set the title
plt.title("The first dataset")
# Set the y-axis label
plt.ylabel('Throughput (mb/s)')
# Set the x-axis label
plt.xlabel('Latency (ms)')
# Set axis range: [xmin, xmax, ymin, ymax], i.e. both axes span 0 to 30
plt.axis([0, 30, 0, 30])
plt.show()

# UNQ_C1
# GRADED FUNCTION: estimate_gaussian

def estimate_gaussian(X): 
    """
    Calculates mean and variance of all features 
    in the dataset

    The variance is the population variance, i.e. the squared deviations
    from the mean are averaged over all m examples (divisor m, not m - 1).

    Args:
        X (ndarray): (m, n) Data matrix

    Returns:
        mu (ndarray): (n,) Mean of all features
        var (ndarray): (n,) Variance of all features

    Raises:
        ValueError: If X is not two-dimensional or contains no examples.
    """

    X = np.asarray(X)
    # Unpacking the shape also rejects anything that is not 2-D.
    m, _ = X.shape
    if m == 0:
        raise ValueError("X must contain at least one example")

    ### START CODE HERE ### 
    # Column-wise average over the m examples.
    mu = X.sum(axis=0) / m
    # Average squared deviation from the mean, per feature.
    var = ((X - mu) ** 2).sum(axis=0) / m
    ### END CODE HERE ### 

    return mu, var

# --- Fit the per-feature Gaussian on the training set and check it ---
# Estimate mean and variance of each feature
mu, var = estimate_gaussian(X_train)              

print("Mean of each feature:", mu)
print("Variance of each feature:", var)

# UNIT TEST
# NOTE(review): estimate_gaussian_test is presumably supplied by this star
# import; it is imported here, mid-file, rather than at the top -- confirm.
from public_tests import *
estimate_gaussian_test(estimate_gaussian)

# Returns the density of the multivariate normal
# at each data point (row) of X_train
# NOTE(review): multivariate_gaussian is not defined in this file; presumably
# it comes from `utils` -- confirm. `p` is reused later to flag outliers.
p = multivariate_gaussian(X_train, mu, var)

# Plotting code
# NOTE(review): visualize_fit is an external helper; what it draws is not
# visible from here.
visualize_fit(X_train, mu, var)

# UNQ_C2
# GRADED FUNCTION: select_threshold

def select_threshold(y_val, p_val): 
    """
    Finds the best threshold to use for selecting outliers 
    based on the results from a validation set (p_val) 
    and the ground truth (y_val)

    1000 evenly spaced candidate thresholds between min(p_val) and
    max(p_val) are tried. An example is predicted to be an anomaly when
    its value in p_val is strictly below the candidate, and the candidate
    with the highest F1 score is kept (the first one wins on ties).

    Args:
        y_val (ndarray): Ground truth on validation set (1 marks an anomaly)
        p_val (ndarray): Results on validation set

    Returns:
        epsilon (float): Threshold chosen 
        F1 (float):      F1 score by choosing epsilon as threshold

        Both are 0 when no candidate threshold yields a true positive, or
        when every value in p_val is identical.
    """ 

    y_val = np.asarray(y_val)
    p_val = np.asarray(p_val)

    best_epsilon = 0
    best_F1 = 0
    F1 = 0

    # The extremes do not change inside the loop, so compute them once.
    p_min = p_val.min()
    p_max = p_val.max()
    step_size = (p_max - p_min) / 1000

    # All values equal: there is no range to scan, and np.arange cannot
    # work with a zero step.
    if step_size == 0:
        return best_epsilon, best_F1

    is_anomaly = (y_val == 1)

    for epsilon in np.arange(p_min, p_max, step_size):

        ### START CODE HERE ### 
        predictions = (p_val < epsilon)

        tp = np.count_nonzero(predictions & is_anomaly)
        fp = np.count_nonzero(predictions & ~is_anomaly)
        fn = np.count_nonzero(~predictions & is_anomaly)

        if tp == 0:
            # Precision and/or recall would be 0 or 0/0 here; treat F1 as 0
            # instead of dividing by zero.
            F1 = 0.0
        else:
            prec = tp / (tp + fp)
            rec = tp / (tp + fn)
            F1 = 2 * prec * rec / (prec + rec)
        ### END CODE HERE ### 

        if F1 > best_F1:
            best_F1 = F1
            best_epsilon = epsilon

    return best_epsilon, best_F1

# --- Pick the threshold on the validation set and flag training outliers ---
# Evaluate the validation set with the parameters fitted on the training set
p_val = multivariate_gaussian(X_val, mu, var)
# Choose the threshold (and its F1 score) from the validation labels
epsilon, F1 = select_threshold(y_val, p_val)

print('Best epsilon found using cross-validation: %e' % epsilon)
print('Best F1 on Cross Validation Set: %f' % F1)

# UNIT TEST
# NOTE(review): select_threshold_test is not defined in this file; presumably
# it comes from the earlier `from public_tests import *` -- confirm.
select_threshold_test(select_threshold)

# Find the outliers in the training set: boolean mask that is True where the
# training value in `p` (computed earlier from X_train) is below the threshold
outliers = p < epsilon

# Visualize the fit
visualize_fit(X_train, mu, var)

# Draw a red circle around those outliers ('ro' with markerfacecolor='none'
# gives an unfilled red ring)
plt.plot(X_train[outliers, 0], X_train[outliers, 1], 'ro',
         markersize= 10,markerfacecolor='none', markeredgewidth=2)

# --- Second dataset: repeat the same pipeline end to end ---
# Load the dataset
# NOTE(review): load_data_multi is not defined in this file; presumably it
# comes from `utils` -- confirm.
X_train_high, X_val_high, y_val_high = load_data_multi()

print ('The shape of X_train_high is:', X_train_high.shape)
print ('The shape of X_val_high is:', X_val_high.shape)
print ('The shape of y_val_high is: ', y_val_high.shape)

# Apply the same steps to the larger dataset

# Estimate the Gaussian parameters
mu_high, var_high = estimate_gaussian(X_train_high)

# Evaluate the probabilities for the training set
p_high = multivariate_gaussian(X_train_high, mu_high, var_high)

# Evaluate the probabilities for the cross validation set
p_val_high = multivariate_gaussian(X_val_high, mu_high, var_high)

# Find the best threshold
epsilon_high, F1_high = select_threshold(y_val_high, p_val_high)

print('Best epsilon found using cross-validation: %e'% epsilon_high)
print('Best F1 on Cross Validation Set:  %f'% F1_high)
# Count the training values that fall below the chosen threshold
print('# Anomalies found: %d'% sum(p_high < epsilon_high))

Mean of each feature:	[14.11222578 14.99771051]
Variance of each feature:	[1.83263141 1.70974533]

Best epsilon found using cross-validation:	8.99e-05
Best F1 on Cross Validation Set:	0.875

Best epsilon found using cross-validation:	1.38e-18
Best F1 on Cross Validation Set:	0.615385
# Anomalies found:	117

Anomaly Detection¶

Outline¶

1 - Packages¶

2 - Anomaly detection¶

2.1 Problem Statement¶

2.2 Dataset¶

View the variables¶

Check the dimensions of your variables¶

Visualize your data¶

2.3 Gaussian distribution¶

2.3.1 Estimating parameters for a Gaussian¶

Exercise 1¶

2.3.2 Selecting the threshold $\epsilon$¶

Exercise 2¶

2.4 High dimensional dataset¶

Check the dimensions of your variables¶

Anomaly detection¶