Data Science

Basic Models

Required files:

test_plan.py

#!/usr/bin/env python
"""
This example shows how to display various data modelling techniques and their
associated statistics in Testplan. The models used are:

* linear regression
* classification
* clustering
"""
import os
import sys
import random

from testplan import test_plan
from testplan.testing.multitest import MultiTest
from testplan.testing.multitest.suite import testsuite, testcase
from testplan.report.testing.styles import Style

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.cluster import KMeans
from sklearn import datasets, linear_model, svm

import matplotlib

matplotlib.use("agg")
import matplotlib.pyplot as plot
import numpy as np


def create_scatter_plot(title, x, y, x_label, y_label, c=None):
    """
    Draw a titled scatter plot with a grid on the current Matplotlib figure.

    :param title: Text shown as the plot title.
    :param x: Values plotted along the horizontal axis.
    :param y: Values plotted along the vertical axis.
    :param x_label: Caption of the horizontal axis.
    :param y_label: Caption of the vertical axis.
    :param c: Optional marker colour(s), forwarded unchanged to ``scatter``.
    """
    plot.scatter(x, y, c=c)

    # Decorations are independent of each other; text first, grid last.
    plot.title(title)
    plot.xlabel(x_label)
    plot.ylabel(y_label)
    plot.grid()


def create_image_plot(title, img_data, rows, columns, index):
    """
    Render one image into a cell of a ``rows`` x ``columns`` subplot grid.

    :param title: Text shown above the image.
    :param img_data: Image data handed to ``imshow``.
    :param rows: Number of rows in the subplot grid.
    :param columns: Number of columns in the subplot grid.
    :param index: Position of this image in the grid, as passed to ``subplot``.
    """
    # Select the target cell before drawing anything into it.
    plot.subplot(rows, columns, index)

    inverted_grey = plot.cm.gray_r
    plot.imshow(img_data, cmap=inverted_grey, interpolation="nearest")
    plot.axis("off")
    plot.title(title)


@testsuite
class ModelExamplesSuite(object):
    """
    Testcases that each train a small scikit-learn model, log its statistics
    and attach a Matplotlib plot to the report.
    """

    @testcase
    def basic_linear_regression(self, env, result):
        """
        This example was based on:
        http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html.
        Train a basic linear regression model on BMI and the likelihood of
        diabetes. In the report, record:
         * coefficients
         * mean squared error
         * variance score (R^2)
         * plot of the regression with the test data
        """
        # Gather and separate the data into features (X) and results (y). We are
        # only using the BMI feature to compare against likelihood of diabetes.
        diabetes = datasets.load_diabetes()
        X = diabetes.data[:, np.newaxis, 2]
        y = diabetes.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2
        )

        # Train the linear regression model and make predictions.
        regr = linear_model.LinearRegression()
        regr.fit(X_train, y_train)
        diabetes_y_pred = regr.predict(X_test)

        # Log the statistics to the report. The second score is the R^2
        # value computed by ``r2_score``, so it is labelled as such.
        mse = mean_squared_error(y_test, diabetes_y_pred)
        r2 = r2_score(y_test, diabetes_y_pred)
        result.log("Coefficients: {}".format(regr.coef_))
        result.log("Mean squared error: {0:.2f}".format(mse))
        result.log("Variance score (R^2): {0:.2f}".format(r2))

        # Plot the predictions and display this plot on the report.
        create_scatter_plot(
            "Basic Linear Regression Example",
            X_test,
            y_test,
            "BMI (kg/m^2)",
            "Likelihood of diabetes",
            "black",
        )
        plot.plot(X_test, diabetes_y_pred, color="blue", linewidth=3)
        result.matplot(plot)

    @testcase
    def basic_classifier(self, env, result):
        """
        This example was based on:
        http://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html#sphx-glr-auto-examples-classification-plot-digits-classification-py.
        Train a basic classifier to classify hand drawn numbers. In the report,
        record:
         * precision per class
         * recall per class
         * f1-score per class
         * support per class
         * some example images with the predicted and actual classes
        """
        # Gather and split the data into features (X) and results (y). We
        # flatten each of the 8x8 digit images into a vector of 64 values.
        digits = datasets.load_digits()
        n_samples = len(digits.images)
        data = digits.images.reshape((n_samples, -1))
        X_train, X_test, y_train, y_test = train_test_split(
            data, digits.target, test_size=0.2
        )

        # Train the classifier and make predictions.
        classifier = svm.SVC(gamma=0.001)
        classifier.fit(X_train, y_train)
        predicted = classifier.predict(X_test)

        # Log the precision, recall, f1 and support statistics (within the
        # classification report) to the report.
        result.log(classification_report(y_test, predicted))

        # Show a few random images from the test set, side by side, with
        # their predictions and actual values. The same count drives both
        # the sample size and the number of subplot columns.
        n_examples = 3
        chosen = random.sample(range(len(y_test)), n_examples)
        for position, sample in enumerate(chosen, start=1):
            caption = "Prediction: {}\nActual: {}".format(
                predicted[sample], y_test[sample]
            )
            create_image_plot(
                caption,
                X_test[sample].reshape((8, 8)),
                1,
                n_examples,
                position,
            )
        # NOTE(review): the two numbers are presumably the width and height
        # of the rendered image - confirm against ``result.matplot``.
        result.matplot(plot, 4, 3)

    @testcase
    def basic_k_means_cluster(self, env, result):
        """
        Train a basic k means cluster on some randomly generated blobs of data.
        In the report, record:
         * the number of clusters
         * the plot of the clusters
        """
        # Create random data blobs and train a K-Means cluster to group this
        # data into 3 clusters. The fixed seed is shared by the data
        # generation and the clusterer.
        n_clusters = 3
        random_state = 100
        X, y = datasets.make_blobs(n_samples=1500, random_state=random_state)
        clusterer = KMeans(n_clusters=n_clusters, random_state=random_state)
        y_pred = clusterer.fit_predict(X)

        # Log the number of clusters and plot the clustered data, colouring
        # each point by the cluster it was assigned to.
        result.log("Number of clusters: {}".format(n_clusters))
        create_scatter_plot(
            "Basic K-Means Cluster Example",
            X[:, 0],
            X[:, 1],
            "Feature 1",
            "Feature 2",
            c=y_pred,
        )
        result.matplot(plot)


# Hard-coding `pdf_path` and `pdf_style` so that the downloadable example gives
# meaningful and presentable output. NOTE: passing these arguments
# programmatically will cause Testplan to ignore any command line arguments
# related to that functionality.
@test_plan(
    name="Basic Data Modelling Example",
    pdf_path=os.path.join(os.path.dirname(__file__), "report.pdf"),
    pdf_style=Style(passing="assertion-detail", failing="assertion-detail"),
)
def main(plan):
    """
    Testplan decorated main function: registers the model examples suite,
    wrapped in a single MultiTest, with the plan so that it gets executed.

    :param plan: Plan object supplied by the ``test_plan`` decorator.
    :return: Testplan result object.
    :rtype:  :py:class:`~testplan.base.TestplanResult`
    """
    suites = [ModelExamplesSuite()]
    plan.add(MultiTest(name="Model Examples", suites=suites))


if __name__ == "__main__":
    # A falsy result from ``main()`` (presumably a failed run) becomes exit
    # status 1, a truthy one exit status 0.
    sys.exit(0 if main() else 1)

PDF report

Sample first page.

../_images/basic_data_science.png

Overfitting

Required files:

test_plan.py

#!/usr/bin/env python
# This plan contains tests that demonstrate failures as well.
"""
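This example shows how to use Testplan to spot underfitted and overfitted
models. Polynomial regression models with an increasing number of degrees of
freedom are trained on the cosine data from ``cos_data.csv``. For each model
the following are checked against thresholds and a plot is recorded:

* cross-validated Mean Square Error (MSE)
* time taken to train the model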
"""
import os
import sys

from testplan import test_plan
from testplan.testing.multitest import MultiTest
from testplan.testing.multitest.suite import testsuite, testcase
from testplan.report.testing.styles import Style
from testplan.common.utils.timing import Timer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import matplotlib

matplotlib.use("agg")
import matplotlib.pyplot as plot
import numpy as np


def create_scatter_plot(title, x, y, label, c=None):
    """
    Create a Matplotlib scatter plot with fixed axis names and ranges.

    :param title: Text shown as the plot title.
    :param x: Values plotted along the horizontal axis.
    :param y: Values plotted along the vertical axis.
    :param label: Legend label attached to the scattered points.
    :param c: Optional marker colour(s), forwarded unchanged to ``scatter``.
    """
    plot.scatter(x, y, label=label, c=c)

    # Text decorations.
    plot.title(title)
    plot.xlabel("x")
    plot.ylabel("y")

    # Fixed viewport: x in [0, 1], y in [-2, 2], with a grid.
    plot.xlim((0, 1))
    plot.ylim((-2, 2))
    plot.grid()


def interpolate_docstring(docstring, kwargs):
    """
    Use the original docstring, formatting it using kwargs via string
    interpolation.

    e.g. ``"foo: {foo}, bar: {bar}".format(foo=2, bar=5)`` -> ``"foo: 2, bar: 5"``

    :param docstring: Docstring template containing ``{name}`` placeholders.
        ``None`` (the ``__doc__`` of a function without a docstring) is
        returned unchanged instead of raising ``AttributeError``.
    :param kwargs: Mapping of placeholder names to the values to insert.
    :return: The formatted docstring, or ``None`` if ``docstring`` is ``None``.
    """
    if docstring is None:
        # Nothing to interpolate into.
        return None
    return docstring.format(**kwargs)


@testsuite
class ModelExamplesSuite(object):
    """
    Fits polynomial regressions of several degrees to the cosine data and
    checks their error and training time against thresholds.
    """

    def setup(self, env, result):
        """
        Load the raw data from the CSV file.
        Log this data as a table in the report.
        """
        # Load the raw cosine data from the CSV file that sits next to this
        # script; the first row is a header and is skipped.
        self.x, self.y = np.loadtxt(
            os.path.join(os.path.dirname(__file__), "cos_data.csv"),
            delimiter=",",
            unpack=True,
            skiprows=1,
        )
        # Evenly spaced inputs used to draw each fitted model as a curve.
        self.x_test = np.linspace(0, 1, 100)

        # Log it to display in the report, this will show the first 5 and last 5
        # rows if there are more than 10 rows.
        data = [["X", "y"]] + [[x, y] for x, y in zip(self.x, self.y)]
        result.table.log(data, description="Raw cosine data")

    @testcase(
        parameters={"degrees": [2, 3, 4, 5, 10, 15]},
        docstring_func=interpolate_docstring,
    )
    def polynomial_regression(self, env, result, degrees):
        """
        Create and train a polynomial regression function with {degrees} degrees
        of freedom. Check if the Mean Square Error (MSE) and time to train the
        model are within their thresholds. Display the train data and the model
        on a plot.
        """
        # This example was based on
        # http://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html

        # Create the pipeline to train a polynomial regression with varying
        # degrees of freedom.
        polynomial_features = PolynomialFeatures(
            degree=degrees, include_bias=False
        )
        pipeline = Pipeline(
            [
                ("polynomial_features", polynomial_features),
                ("linear_regression", LinearRegression()),
            ]
        )

        # The estimators are given the samples as a single-feature column.
        features = self.x[:, np.newaxis]

        # Train the model and record how long this takes.
        timer = Timer()
        with timer.record("train_model"):
            pipeline.fit(features, self.y)

        # Score the model with 10-fold cross validation. The scorer yields
        # negated MSE values, hence the sign flip in the check below.
        scores = cross_val_score(
            pipeline,
            features,
            self.y,
            scoring="neg_mean_squared_error",
            cv=10,
        )

        # Check the Mean Square Error (MSE) and time to train the model are
        # within their thresholds.
        result.less(
            -scores.mean(),
            0.05,
            description="Mean Square Error threshold on test data",
        )
        result.less(
            timer["train_model"].elapsed,
            1,
            description="How long did the model take to train?",
        )

        # Display the train data and the model on a plot.
        create_scatter_plot(
            title="{} degrees of freedom model & Train data".format(degrees),
            x=self.x,
            y=self.y,
            label="Samples",
            c="black",
        )
        y_test = pipeline.predict(self.x_test[:, np.newaxis])
        plot.plot(self.x_test, y_test, label="Model")
        plot.legend(loc="best")
        result.matplot(plot)


# Hard-coding `pdf_path` and `pdf_style` so that the downloadable example gives
# meaningful and presentable output. NOTE: passing these arguments
# programmatically will cause Testplan to ignore any command line arguments
# related to that functionality.
@test_plan(
    name="Basic Data Modelling Example",
    pdf_path=os.path.join(os.path.dirname(__file__), "report.pdf"),
    pdf_style=Style(passing="assertion-detail", failing="assertion-detail"),
)
def main(plan):
    """
    Testplan decorated main function: registers the model examples suite,
    wrapped in a single MultiTest, with the plan so that it gets executed.

    :param plan: Plan object supplied by the ``test_plan`` decorator.
    :return: Testplan result object.
    :rtype:  :py:class:`~testplan.base.TestplanResult`
    """
    suites = [ModelExamplesSuite()]
    plan.add(MultiTest(name="Model Examples", suites=suites))


if __name__ == "__main__":
    # A falsy result from ``main()`` (presumably a failed run) becomes exit
    # status 1, a truthy one exit status 0.
    sys.exit(0 if main() else 1)

cos_data.csv

X,y
1.878980043635514185e-02,8.254554463594766522e-01
2.021839744032571939e-02,1.190542123094355809e+00
6.022547162926983333e-02,9.090314946606257163e-01
7.103605819788694209e-02,9.006851940779262433e-01
8.712929970154070780e-02,7.916070845603220274e-01
1.182744258689332195e-01,9.264061152546703148e-01
1.289262976548533057e-01,6.596588035740591494e-01
1.433532874090464038e-01,7.590975834454046778e-01
2.103825610738409013e-01,4.579684965884241454e-01
2.645556121046269693e-01,3.571530123984236194e-01
3.154283509241838646e-01,3.319464989659558912e-02
3.595079005737860101e-01,-2.410954388528686043e-01
3.637107709426226076e-01,-1.454801195509775880e-01
3.834415188257777052e-01,-1.911078724170920673e-01
4.146619399905235870e-01,-3.672868255522013792e-01
4.236547993389047084e-01,-3.826473562137094331e-01
4.370319537993414549e-01,-5.328826217219828631e-01
4.375872112626925103e-01,-5.080332597798727923e-01
4.561503322165485486e-01,-6.142335792185962262e-01
4.614793622529318462e-01,-6.037908408210926892e-01
5.218483217500716753e-01,-8.573505437373947213e-01
5.288949197529044799e-01,-9.691753605077865208e-01
5.448831829968968643e-01,-8.220522052993319839e-01
5.488135039273247529e-01,-8.898835354885372695e-01
5.680445610939323098e-01,-1.056955394776426083e+00
5.684339488686485087e-01,-8.484782439645164320e-01
6.027633760716438749e-01,-1.045729653474798848e+00
6.120957227224214092e-01,-9.619217724657304069e-01
6.169339968747569181e-01,-8.997541785724504360e-01
6.176354970758770602e-01,-9.605272878238070300e-01
6.399210213275238202e-01,-8.781279178485223991e-01
6.458941130666561170e-01,-1.118695341735484572e+00
6.667667154456676792e-01,-9.597657247408495351e-01
6.706378696181594101e-01,-1.068305910056648100e+00
6.818202991034834071e-01,-1.084531117694096158e+00
6.976311959272648577e-01,-1.047257970376244129e+00
7.151893663724194772e-01,-1.005126754505744513e+00
7.742336894342166653e-01,-8.686382664494641803e-01
7.781567509498504842e-01,-9.816460728182452300e-01
7.805291762864554617e-01,-7.693873146907472815e-01
7.917250380826645895e-01,-7.847505168286996735e-01
7.991585642167235992e-01,-9.649656670238975220e-01
8.326198455479379978e-01,-5.606550175815971926e-01
8.700121482468191614e-01,-3.853694966648019138e-01
8.917730007820797722e-01,-3.703060260261964443e-01
9.255966382926610336e-01,-3.614703651112560756e-01
9.437480785146241669e-01,-3.690626391397222594e-01
9.446689170495838894e-01,-1.523519693038673517e-01
9.636627605010292807e-01,-2.107173155867676784e-01
9.786183422327640047e-01,2.165622103369364837e-02

PDF report

Sample first page.

../_images/overfitting.png