
Note for Data Exploration and Features Selection

The full Jupyter notebook is here.

Demo dataset

The following data is used to set up the demos for the data exploration templates below.

  • RNA structure data

        MFE   Structure
    1  -0.50  ......(((.....)))...............
    2  -0.50  ......(((.....)))...............
  • Sequence data

       Sequence   21A-1-6   21B-1-6   21C-1-6  MFEA  MFEB  MFEC
    0  AAAAAA    1.657296  0.983233  1.307640  -3.4  -0.5  -8.6
    1  AAAAAC    1.599794  1.014041  1.560623  -3.4  -0.5  -8.6
    2  AAAAAG    1.335313  0.817613  0.355829  -3.4  -0.5  -8.6
    3  AAAAAU    1.757485  1.396370  1.560339  -3.4  -0.5  -8.6

Label encoder template (structure data)

from sklearn.preprocessing import LabelEncoder

column_names = structure_table.columns
new_structure_table = structure_table.copy()
my_label_encoder = LabelEncoder()
classes_le = []  # store the original labels (classes_) of each column
for col_name in column_names:
    new_structure_table[col_name] = my_label_encoder.fit_transform(structure_table[col_name])
    classes_le.append(my_label_encoder.classes_)  # the encoder is refit per column, so keep its classes_
new_structure_table.head()
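Because the same LabelEncoder instance is refit for every column, only the last column's classes remain inside the encoder; the classes_le list keeps the per-column labels so the integer codes can be decoded again. A minimal sketch, using only the variables defined above:

# Decode the integer codes back to the original characters using the stored classes_
decoded = new_structure_table.copy()
for i, col_name in enumerate(column_names):
    decoded[col_name] = classes_le[i][new_structure_table[col_name].values]
decoded.head()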

One-hot encoding (sequence data)

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
# (in scikit-learn >= 1.2 the argument is sparse_output; older versions use sparse=False)
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(sequence_table))
OH_cols_train.columns = OH_encoder.get_feature_names_out()  # get_feature_names() in older versions
OH_cols_train.head()
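The encoder returns a plain array, so the resulting DataFrame loses the original index. A minimal sketch (assuming sequence_table from above; other_feature_table is a hypothetical table you would join with) that restores the index so the one-hot columns can be merged back with other features:

# One-hot encoding dropped the index; put it back before joining to other tables
OH_cols_train.index = sequence_table.index
# e.g. joined = pd.concat([other_feature_table, OH_cols_train], axis=1)
OH_cols_train.head()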

Data normalization

from sklearn.preprocessing import StandardScaler

# Separating out the features
temp_data = input_data.drop(['Sequence'], axis = 1).abs()
features = temp_data.columns
X = temp_data.values
# Separating out the target
#y = df.loc[:,['target']].values
# Standardizing the features
X = StandardScaler().fit_transform(X)
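StandardScaler can also be kept as a fitted object so the same mean and variance are applied to any later data. A minimal sketch, where new_data is a hypothetical table with the same feature columns:

# Fit the scaler once, then reuse the same scaling parameters on later data
scaler = StandardScaler().fit(temp_data.values)
X = scaler.transform(temp_data.values)
#X_new = scaler.transform(new_data[features].values)  # hypothetical held-out data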

PCA analysis

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Setting parameters
components = 10
# Generate PC labels (PC1 ... PC10)
pc_features = []
for i in np.arange(0, components):
    pc_features.append('PC' + str(i + 1))
pca = PCA(n_components=components)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data=principalComponents, columns=pc_features)

# How much variance each component explains
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_.cumsum())
#print(pca.singular_values_)

finalDf = principalDf.copy()

# Visualize the data: 2D scatter of PC1 vs PC2
fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(111)
pc1 = finalDf.iloc[:, 0]
pc2 = finalDf.iloc[:, 1]
ax.scatter(pc1, pc2, s=0.4)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")

# 3D scatter of the first three principal components
fig2 = plt.figure(figsize=(7, 5))
ax2 = fig2.add_subplot(111, projection='3d')
ax2.view_init(elev=48, azim=134)
ax2.scatter(finalDf.PC1, finalDf.PC2, finalDf.PC3, edgecolor=None, s=0.5)
ax2.set_xlabel("PC1")
ax2.set_ylabel("PC2")
ax2.set_zlabel("PC3")

# Select a range of values, e.g.:
#finalDf[(finalDf.iloc[:,0] < -1) & (finalDf.iloc[:,1] < 0)]

# Cumulative explained variance vs. number of components
fig3 = plt.figure(figsize=(7, 5))
ax3 = fig3.add_subplot(111)
x_ = np.arange(1, components + 1)
y_ = pca.explained_variance_ratio_.cumsum()
ax3.scatter(x_, y_)
ax3.set_xlabel("PCs")
ax3.set_ylabel("Cumulative Explained Variance")

Feature selection

ExtraTreesClassifier

from sklearn.ensemble import ExtraTreesClassifier

# Discretize the 21B-1-6 values into 3 classes to use as the target
number_labels = 3
y = pd.cut(input_data.loc[:, '21B-1-6'], bins=number_labels, labels=np.arange(number_labels), right=False)
# Drop the identifier, the measurement columns, and MFEA from the features
X = input_data.drop(['Sequence', '21C-1-6', '21A-1-6', '21B-1-6', 'MFEA'], axis=1).abs()
model = ExtraTreesClassifier()
model.fit(X, y)
print(model.feature_importances_)  # use the built-in feature_importances_ of tree-based classifiers
# Plot the feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')
plt.show()
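The fitted model can also be wrapped in SelectFromModel to keep only the features whose importance is above the mean, rather than just plotting them. A minimal sketch, assuming X, y, and model from above:

from sklearn.feature_selection import SelectFromModel

# Keep only the features with above-average importance
selector = SelectFromModel(model, prefit=True)
X_reduced = selector.transform(X)
print(X.columns[selector.get_support()])  # names of the retained features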

SelectKBest

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

number_labels = 3
y = pd.cut(input_data.loc[:, '21B-1-6'], bins=number_labels, labels=np.arange(number_labels), right=False)
X = input_data.drop(['Sequence', '21B-1-6', '21A-1-6', '21C-1-6', 'MFEA'], axis=1).abs()
# Apply the SelectKBest class to score the features (chi2 requires non-negative values)
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# Concatenate the two DataFrames for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # name the DataFrame columns
print(featureScores.nlargest(15, 'Score'))  # print the 15 highest-scoring features
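chi2 only accepts non-negative features, which is presumably why .abs() is applied above. A minimal sketch of the same selection with f_classif (ANOVA F-test), which also works on the signed values:

from sklearn.feature_selection import f_classif

# Score the raw (signed) feature values with the ANOVA F-test instead of chi2
X_signed = input_data.drop(['Sequence', '21B-1-6', '21A-1-6', '21C-1-6', 'MFEA'], axis=1)
f_selector = SelectKBest(score_func=f_classif, k=10).fit(X_signed, y)
f_scores = pd.Series(f_selector.scores_, index=X_signed.columns)
print(f_scores.nlargest(10))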

Group by k-means clustering

Perform k-means

from sklearn.cluster import KMeans

#Model preparation
estimators = [('k_means_5', KMeans(n_clusters=5)),
              ('k_means_3', KMeans(n_clusters=3)),
              ('k_means_bad_init', KMeans(n_clusters=3, n_init=1,
                                          init='random'))]
#Data preparation
y = pd.cut(input_data.loc[:, '21B-1-6'], bins=number_labels, labels=np.arange(number_labels), right=False)  # discretized target (not used by k-means itself)
X = input_data.drop(['Sequence'], axis=1).abs()

fignum = 1
titles = ['5 clusters', '3 clusters', '3 clusters, bad initialization']
for name, est in estimators:
    fig = plt.figure(fignum, figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(elev=48, azim=134)
    est.fit(X)
    labels = est.labels_

    # Color the PCA coordinates (finalDf from the PCA step) by cluster label
    ax.scatter(finalDf.PC1, finalDf.PC2, finalDf.PC3,
               c=labels.astype(float), edgecolor=None)

    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_zticks([])
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    ax.set_title(titles[fignum - 1])
    fignum = fignum + 1
plt.show()
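Choosing the number of clusters is not obvious from the 3D plots alone; a common heuristic is to look at the within-cluster sum of squares (inertia) over a range of k and pick the "elbow". A minimal sketch, assuming X from above:

# Elbow heuristic: inertia for k = 2..10
inertias = []
k_values = range(2, 11)
for k in k_values:
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    inertias.append(km.inertia_)

fig_e = plt.figure(figsize=(7, 5))
ax_e = fig_e.add_subplot(111)
ax_e.plot(list(k_values), inertias, marker='o')
ax_e.set_xlabel("Number of clusters k")
ax_e.set_ylabel("Inertia")
plt.show()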

Attach k-means labels to your pandas DataFrame

# Here we use 5 classes first
estimator = KMeans(n_clusters=5)
estimator.fit(X)
result = finalDf.copy()  # finalDf: the PCA table from above
result['classes'] = estimator.labels_
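With the labels attached, each cluster can be summarized directly from the DataFrame, for example its size and its mean position in PC space:

# Quick cluster summary: points per cluster and mean PC coordinates
print(result['classes'].value_counts())
print(result.groupby('classes').mean())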

If you are more interested in RNA….

Sequence: CCCUCUGGCCUUAAGUCUAACAUGAA-NNNNNN. In the structure data, position 26 corresponds to position 0 of the random sequence, position 27 to 1, and position 28 to 2. By my analysis, if a secondary structure forms at the protein binding site, positions 26 to 31 (counting from 0), with specific nucleotides at those positions, we still obtain high affinity. This provides a novel way to test whether a secondary structure with specific nucleotides can improve the RNA-protein interaction, which differs from the common view that RNA secondary structure hinders the interaction with protein.
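As a small illustration of the position mapping described above (not part of the original analysis): structure position 26 corresponds to position 0 of the random NNNNNN region, so the mapping is simply an offset of 26.

# Hypothetical helper: map a position in the full structure to the random region
RANDOM_REGION_START = 26  # position 26 in the structure = position 0 of NNNNNN

def structure_to_random_pos(structure_pos):
    return structure_pos - RANDOM_REGION_START

for p in range(26, 32):  # the protein binding site, positions 26 to 31
    print(p, "->", structure_to_random_pos(p))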