The full Jupyter notebook is here.
Demo dataset
The following data is used to set up the demos for the data exploration templates below; a short sketch of loading these tables as pandas DataFrames follows the two tables.
- RNA structure data
| | MFE | Structure |
|---|---|---|
| 1 | -0.50 | ……(((…..)))…………… |
| 2 | -0.50 | ……(((…..)))…………… |
- Sequence data
| | Sequence | 21A-1-6 | 21B-1-6 | 21C-1-6 | MFEA | MFEB | MFEC |
|---|---|---|---|---|---|---|---|
| 0 | AAAAAA | 1.657296 | 0.983233 | 1.307640 | -3.4 | -0.5 | -8.6 |
| 1 | AAAAAC | 1.599794 | 1.014041 | 1.560623 | -3.4 | -0.5 | -8.6 |
| 2 | AAAAAG | 1.335313 | 0.817613 | 0.355829 | -3.4 | -0.5 | -8.6 |
| 3 | AAAAAU | 1.757485 | 1.396370 | 1.560339 | -3.4 | -0.5 | -8.6 |
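The templates below assume three pandas objects: structure_table (the RNA structure data), sequence_table (the sequences split into one categorical column per position, for the encoders), and input_data (the full sequence table above). The original notebook builds these from its own files; the following is only a minimal sketch with hypothetical file names showing how equivalent DataFrames could be set up.
import pandas as pd

# Hypothetical file names; substitute whatever the notebook actually loads.
structure_table = pd.read_csv('structure_data.csv')   # columns: MFE, Structure
input_data = pd.read_csv('sequence_data.csv')         # columns: Sequence, 21A-1-6, ..., MFEC

# Split each 6-mer into one column per position so the categorical encoders can be applied.
sequence_table = input_data['Sequence'].apply(lambda s: pd.Series(list(s)))
sequence_table.columns = ['pos_' + str(i) for i in range(sequence_table.shape[1])]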
Label encoder template (structure data)
from sklearn.preprocessing import LabelEncoder

column_names = structure_table.columns
new_structure_table = structure_table.copy()
my_label_encoder = LabelEncoder()
classes_le = []  # list to store the original labels of each column

# Encode each column in place; classes_le[i] maps the integer codes back to the original values.
for col_name in column_names:
    new_structure_table[col_name] = my_label_encoder.fit_transform(structure_table[col_name])
    classes_le.append(my_label_encoder.classes_)

new_structure_table.head()
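Note that the same encoder object is re-fit on every column, so after the loop it only remembers the classes of the last column; the per-column labels saved in classes_le are what you need to decode. A minimal sketch of mapping the encoded integers of the first column back to their original values:
# classes_le[i][code] recovers the original value behind an encoded integer in column i.
first_col = column_names[0]
decoded_first_col = classes_le[0][new_structure_table[first_col].values]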
One-hot encoding (sequence data)
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Apply a one-hot encoder to every column of the per-position sequence table.
# Note: scikit-learn >= 1.2 uses sparse_output= and get_feature_names_out();
# older versions use sparse= and get_feature_names() instead.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(sequence_table))
OH_cols_train.columns = OH_encoder.get_feature_names_out()
OH_cols_train.index = sequence_table.index  # keep the original row index
OH_cols_train.head()
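If the encoded sequence should be modelled together with the numeric measurement columns of input_data, the one-hot block can simply be concatenated column-wise. A short sketch, assuming the row indices of the two tables line up:
# Join the one-hot sequence features with the numeric measurement columns.
numeric_cols = input_data.drop(['Sequence'], axis=1)
model_input = pd.concat([OH_cols_train, numeric_cols], axis=1)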
Data normalization
from sklearn.preprocessing import StandardScaler

# Separate out the features (drop the sequence string; use absolute values of the measurements)
temp_data = input_data.drop(['Sequence'], axis=1).abs()
features = temp_data.columns
X = temp_data.values

# Separate out the target (not used in this template)
#y = df.loc[:, ['target']].values

# Standardize the features to zero mean and unit variance
X = StandardScaler().fit_transform(X)
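A quick sanity check that the scaling behaved as expected: after StandardScaler each column should have a mean close to 0 and a standard deviation close to 1.
import numpy as np

print(np.round(X.mean(axis=0), 3))  # approximately 0 for every feature
print(np.round(X.std(axis=0), 3))   # approximately 1 for every feature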
PCA analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the 3D projection on older matplotlib versions
from sklearn.decomposition import PCA

# Set parameters (the number of components cannot exceed the number of feature columns)
components = min(10, X.shape[1])

# Generate PC labels: PC1 ... PCn
pc_features = ['PC' + str(i) for i in np.arange(1, components + 1)]

pca = PCA(n_components=components)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data=principalComponents, columns=pc_features)

print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_.cumsum())
#print(pca.singular_values_)

# If a labelled target were available, it could be attached here, e.g.
#finalDf = pd.concat([principalDf, df[['target']]], axis=1)
finalDf = principalDf.copy()

# Visualize the data: PC1 vs PC2
fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(111)
pc1 = finalDf.iloc[:, 0]
pc2 = finalDf.iloc[:, 1]
ax.scatter(pc1, pc2, s=0.4)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")

# 3D scatter of the first three PCs
fig2 = plt.figure(figsize=(7, 5))
ax2 = fig2.add_subplot(111, projection='3d')
ax2.view_init(elev=48, azim=134)
ax2.scatter(finalDf.PC1, finalDf.PC2, finalDf.PC3, s=0.5)
ax2.set_xlabel("PC1")
ax2.set_ylabel("PC2")
ax2.set_zlabel("PC3")

# Select a range of values, e.g.
#finalDf[(finalDf.iloc[:, 0] < -1) & (finalDf.iloc[:, 1] < 0)]

# Cumulative explained variance against the number of PCs
fig3 = plt.figure(figsize=(7, 5))
ax3 = fig3.add_subplot(111)
x_ = np.arange(1, components + 1)
y_ = pca.explained_variance_ratio_.cumsum()
ax3.scatter(x_, y_)
ax3.set_xlabel("PCs")
ax3.set_ylabel("Cumulative Explained Variance")
Feature selection
ExtraTreesClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier

# Discretize one measurement column into 3 classes to serve as the target
number_labels = 3
y = pd.cut(input_data.loc[:, '21B-1-6'], bins=number_labels,
           labels=np.arange(number_labels), right=False)
# Use the remaining numeric columns as features
X = input_data.drop(['Sequence', '21A-1-6', '21B-1-6', '21C-1-6', 'MFEA'], axis=1).abs()

model = ExtraTreesClassifier()
model.fit(X, y)
print(model.feature_importances_)  # built-in feature importances of tree-based classifiers

# Plot the feature importances for easier comparison
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')
plt.show()
SelectKBest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Same 3-class target and feature matrix as above
number_labels = 3
y = pd.cut(input_data.loc[:, '21B-1-6'], bins=number_labels,
           labels=np.arange(number_labels), right=False)
X = input_data.drop(['Sequence', '21A-1-6', '21B-1-6', '21C-1-6', 'MFEA'], axis=1).abs()

# Apply SelectKBest with the chi-squared test (chi2 requires non-negative features, hence .abs());
# k must not exceed the number of feature columns, so cap it for small tables.
bestfeatures = SelectKBest(score_func=chi2, k=min(10, X.shape[1]))
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Concatenate the two DataFrames for easier reading
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # name the DataFrame columns
print(featureScores.nlargest(15, 'Score'))  # print the highest-scoring features
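If you want the reduced feature matrix itself rather than just the scores, the fitted selector can transform X directly, and get_support() returns a boolean mask of the kept columns. A short sketch:
X_selected = fit.transform(X)                # keep only the k best-scoring columns
kept_columns = X.columns[fit.get_support()]  # names of the selected features
print(kept_columns)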
Grouping with k-means clustering
Perform k-means
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Model preparation: three k-means configurations to compare
estimators = [('k_means_5', KMeans(n_clusters=5)),
              ('k_means_3', KMeans(n_clusters=3)),
              ('k_means_bad_init', KMeans(n_clusters=3, n_init=1, init='random'))]

# Data preparation
number_labels = 3
y = pd.cut(input_data.loc[:, '21B-1-6'], bins=number_labels,
           labels=np.arange(number_labels), right=False)  # discretized target; not used by k-means itself
X = input_data.drop(['Sequence'], axis=1).abs()

fignum = 1
titles = ['5 clusters', '3 clusters', '3 clusters, bad initialization']
for name, est in estimators:
    fig = plt.figure(fignum, figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(elev=48, azim=134)
    est.fit(X)
    labels = est.labels_
    # Color the PCA scatter (first three PCs) by cluster assignment
    ax.scatter(finalDf.PC1, finalDf.PC2, finalDf.PC3, c=labels.astype(float))
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_zticklabels([])
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    ax.set_title(titles[fignum - 1])
    fignum = fignum + 1
    fig.show()
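The comparison above fixes the number of clusters in advance. If the right number is unclear, a common heuristic (not part of the original notebook) is an elbow plot of the k-means inertia over a range of k:
# Fit k-means for a range of k and look for the "elbow" where adding clusters
# stops reducing the within-cluster sum of squares (inertia) substantially.
inertias = []
k_values = range(2, 11)
for k in k_values:
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    inertias.append(km.inertia_)

plt.figure(figsize=(7, 5))
plt.plot(list(k_values), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.show()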
Attach k-means labels to your pandas DataFrame
# Use 5 clusters first
estimator = KMeans(n_clusters=5)
estimator.fit(X)
result = finalDf.copy()  # or input_data.copy(), whichever table you want to label
result['classes'] = estimator.labels_
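Once the labels are attached, a quick way to characterize the clusters is to count them and summarize each group; a small sketch:
# Cluster sizes and per-cluster mean coordinates
print(result['classes'].value_counts())
print(result.groupby('classes').mean())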
If you are more interested in the RNA…
Sequence: CCCUCUGGCCUUAAGUCUAACAUGAA-NNNNNN. In the structure data, position 26 corresponds to position 0 of the random region, position 27 to position 1, position 28 to position 2, and so on. By my analysis, if there is secondary structure at the protein binding site, positions 26 to 31 (counting from 0), with specific nucleotides at those two positions, we still get high affinity. This provides a novel way to test whether secondary structure carrying specific nucleotides can improve an RNA-protein interaction, which differs from what we have known so far, namely that RNA secondary structure hinders binding to the protein.
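To make the index mapping concrete, here is a small sketch (my own illustration, with placeholder values, not taken from the notebook) that lines up a dot-bracket structure with the randomized 6-mer: structure positions 26 to 31 of the full sequence correspond to positions 0 to 5 of the random region.
# Placeholder example values for illustration only.
full_sequence = 'CCCUCUGGCCUUAAGUCUAACAUGAA' + 'AAAAAA'  # constant 26-mer plus one random 6-mer
structure = '.' * len(full_sequence)                     # placeholder dot-bracket string of matching length

# Structure characters over the randomized region (the protein binding site, positions 26-31)
for i, (nt, db) in enumerate(zip(full_sequence[26:32], structure[26:32])):
    print('random position', i, '= full position', 26 + i, ':', nt, db)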