# Source code for qiskit_machine_learning.datasets.breast_cancer
# This code is part of Qiskit.
#
# (C) Copyright IBM 2018, 2022.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.
"""
breast cancer dataset
"""
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from qiskit.utils import optionals
from .dataset_helper import features_and_labels_transform
from ..deprecation import deprecate_function
@deprecate_function("0.4.0")
def breast_cancer(training_size, test_size, n, plot_data=False, one_hot=True):
    """
    Returns breast cancer dataset.

    This function is deprecated in version 0.4.0

    The scikit-learn breast cancer data is split 70/30 into train and test
    parts with a fixed random seed, standardized, reduced to ``n`` features
    with PCA and finally rescaled to the range ``[-1, +1]``. The first
    ``training_size`` / ``test_size`` samples of each class are then selected.

    Args:
        training_size: maximum number of training samples kept per class.
        test_size: maximum number of test samples kept per class. This does
            not control the train/test split ratio, which is fixed at 0.3.
        n: number of PCA components to keep (the comment in the code ties this
            to the number of qubits).
        plot_data: if ``True``, show a scatter plot of the first two PCA
            components of the selected training samples. Requires matplotlib
            and ``n >= 2``, because columns 0 and 1 are plotted.
        one_hot: forwarded unchanged to ``features_and_labels_transform``
            (presumably selects one-hot encoded labels; see that helper).

    Returns:
        A tuple ``(training_feature_array, training_label_array,
        test_feature_array, test_label_array)`` as produced by
        ``features_and_labels_transform`` for the training and test selections.
    """
    # Label "A" is assigned to target value 0 and "B" to target value 1.
    class_labels = ["A", "B"]

    data, target = datasets.load_breast_cancer(return_X_y=True)
    sample_train, sample_test, label_train, label_test = train_test_split(
        data, target, test_size=0.3, random_state=12
    )

    # Now we standardize for gaussian around 0 with unit variance
    # (the scaler is fitted on the training split only).
    std_scale = StandardScaler().fit(sample_train)
    sample_train = std_scale.transform(sample_train)
    sample_test = std_scale.transform(sample_test)

    # Now reduce number of features to number of qubits
    pca = PCA(n_components=n).fit(sample_train)
    sample_train = pca.transform(sample_train)
    sample_test = pca.transform(sample_test)

    # Scale to the range (-1,+1). The min/max are taken over train and test
    # samples together so that both splits end up inside the target range.
    samples = np.append(sample_train, sample_test, axis=0)
    minmax_scale = MinMaxScaler((-1, 1)).fit(samples)
    sample_train = minmax_scale.transform(sample_train)
    sample_test = minmax_scale.transform(sample_test)

    # Pick training size number of samples from each distro
    training_input = {
        key: (sample_train[label_train == k, :])[:training_size]
        for k, key in enumerate(class_labels)
    }
    test_input = {
        key: (sample_test[label_test == k, :])[:test_size]
        for k, key in enumerate(class_labels)
    }

    training_feature_array, training_label_array = features_and_labels_transform(
        training_input, class_labels, one_hot
    )
    test_feature_array, test_label_array = features_and_labels_transform(
        test_input, class_labels, one_hot
    )

    if plot_data:
        # Check for matplotlib up front, before attempting the import below.
        optionals.HAS_MATPLOTLIB.require_now("breast_cancer")
        # pylint: disable=import-error
        import matplotlib.pyplot as plt

        # One scatter series per class, derived from class_labels rather than
        # a hard-coded class count.
        for k in range(len(class_labels)):
            plt.scatter(
                sample_train[label_train == k, 0][:training_size],
                sample_train[label_train == k, 1][:training_size],
            )

        plt.title("PCA dim. reduced Breast cancer dataset")
        plt.show()

    return (
        training_feature_array,
        training_label_array,
        test_feature_array,
        test_label_array,
    )