In [1]:
""" @author: Maxim Ziatdinov (e-mail: ziatdinovmax@gmail.com) """

#Import modules

# data manipulation and plotting
import numpy as np
import pandas as pd
from collections import OrderedDict
import itertools
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# save & load files
import os
import h5py
import glob

# Machine learning
from keras.layers import Input, Convolution2D, MaxPooling2D, UpSampling2D
from keras.layers import Dropout, Activation, Reshape
from keras.models import Model, load_model
from keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#Work with images and atomic coordinates
import cv2
from skimage.feature import blob_log
from scipy import ndimage
import scipy.spatial as spatial

# Work with graphs
import networkx as nx

np.random.seed(7)
%matplotlib inline
Using TensorFlow backend.

Part I: Constructing and Training Convolutional Neural Network Model

Define architectures of fully convolutional neural networks (FCNs)

We found that generally having separate FCN models tailored to find atoms in the lattice and specific defects allows more flexibility when it comes to the analysis of composite defects compared to a single FCN model that is trained to find probabilities for all the classes (lattice + defects) simultaneously.

In [2]:
def model_lattice(input_img, nb_classes = 2):
    x = Convolution2D(8, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Convolution2D(16, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Convolution2D(32, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
    
    x = Convolution2D(32, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    x = Convolution2D(16, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    x = Convolution2D(8, (3, 3), activation='relu', padding='same')(x) 
    x = UpSampling2D((2, 2))(x)

    x = Convolution2D(nb_classes, (3, 3), activation = 'linear', padding='same')(x)
    x = Convolution2D(nb_classes, (1, 1), activation = 'linear', padding='same')(x)
    x = Reshape((target_size[0] * target_size[1], nb_classes))(x)
    output = Activation('softmax')(x)
    
    return Model(input_img, output)

def model_vacancy(input_img, nb_classes = 2):
    x = Convolution2D(8, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Convolution2D(16, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
    
    x = Convolution2D(16, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    x = Convolution2D(8, (3, 3), activation='relu', padding='same')(x) 
    x = UpSampling2D((2, 2))(x)

    x = Convolution2D(nb_classes, (3, 3), activation = 'linear', padding='same')(x)
    x = Convolution2D(nb_classes, (1, 1), activation = 'linear', padding='same')(x)
    x = Reshape((target_size[0] * target_size[1], nb_classes))(x)
    output = Activation('softmax')(x)
    
    return Model(input_img, output)

def model_dopant(input_img, nb_classes = 2):
    x = Convolution2D(2, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Convolution2D(4, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
    
    x = Convolution2D(4, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    x = Convolution2D(2, (3, 3), activation='relu', padding='same')(x) 
    x = UpSampling2D((2, 2))(x)

    x = Convolution2D(nb_classes, (3, 3), activation = 'linear', padding='same')(x)
    x = Convolution2D(nb_classes, (1, 1), activation = 'linear', padding='same')(x)
    x = Reshape((target_size[0] * target_size[1], nb_classes))(x)
    output = Activation('softmax')(x)
    
    return Model(input_img, output)

Load training data

In our case the training data represents stack(s) of 2D images stored as ndarray(s). Note that in case of a fully convolutional neural network image "labels" are themselves 2D images (aka ground truth). The purpose of this step is to make the format of loaded training data compatible with tensorflow format.

In [3]:
# Preprocessing for training images
def preprocessing_X(image_data, image_size):
    image_data = image_data.reshape(image_data.shape[0], image_size[0], image_size[1], 1)
    image_data = image_data.astype('float32')
    image_data = (image_data - np.amin(image_data))/(np.amax(image_data) - np.amin(image_data))
    return image_data

# preprocessing for "labels" (ground truth)
def preprocessing_Y(image_data, image_size):
    n_images = 0
    label = np.array([])
    for idx in range(image_data.shape[0]):
        img = image_data[idx,:,:]
        n, m = img.shape
        img = np.array(OneHotEncoder(n_values=nb_classes).fit_transform(img.reshape(-1,1)).todense())
        img = img.reshape(n, m, nb_classes)
        label = np.append(label, img)
        n_images += 1
    label_4D = label.reshape(n_images, image_size[0], image_size[1], nb_classes)    
    return label_4D
In [4]:
# load training images and corresponding "labels"
training_images_path = 'Graphene/DL_paper/Training/Images/Lattice.npy'
labels_path = 'Graphene/DL_paper/Training/Labels/Lattice.npy'

X = np.load(training_images_path)
y = np.load(labels_path)
print("Training data loaded.", "\nno. of images in the training set:", str(X.shape[0]),
      "\nresolution of each image:", str((X.shape[1], X.shape[2])))
Training data loaded. 
no. of images in the training set: 1546 
resolution of each image: (128, 128)
In [5]:
# Split into train/test and make the shapes of tensors compatible with tensorflow format
nb_classes = 2
target_size = (128, 128)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train = preprocessing_X(X_train, target_size)
X_test = preprocessing_X(X_test, target_size)
y_train = preprocessing_Y(y_train, target_size)
y_test = preprocessing_Y(y_test, target_size)
y_train = y_train.reshape(y_train.shape[0], target_size[0]*target_size[1], y_train.shape[3])
y_test = y_test.reshape(y_test.shape[0], target_size[0]*target_size[1], y_test.shape[3])

print("The dataset is now split into train and test sets, and is ready for model training.")
The dataset is now split into train and test sets, and is ready for model training.

Compile and train a model

We are now going to train a model for finding atoms in graphene lattice. The other models (e.g. for "dopants"/"adatoms" and "vacancies") can be trained simply by changing the name of a model on line 4.

In [6]:
nb_classes = 2
input_img = Input(shape=(target_size[0], target_size[1], 1)) 

model = model_lattice(input_img)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])
callback_tb = TensorBoard(log_dir='/tmp/Graphene_FCN_lattice', histogram_freq=0,
                          write_graph=True, write_images=False)

model.fit(X_train, y_train, epochs=100, batch_size=32, 
          validation_data=(X_test, y_test), shuffle=True, verbose = 0)

model.save("Graphene/DL_paper/Models/Graphene_FCN_lattice.h5")
model.save_weights("Graphene/DL_paper/Models/Graphene_FCN_lattice_weights.h5")
print('Saved model and weights to disk.\n')

print("Training history")
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(1, 2, 1)
plt.plot(model.history.history['loss'])
ax1.set_title('Training Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
#ax1.set_yscale('log')
ax2 = fig.add_subplot(1, 2, 2)
plt.plot(model.history.history['val_loss'])
ax2.set_title('Test Loss')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Loss')
#ax2.set_yscale('log')
Saved model and weights to disk.

Training history
Out[6]:
<matplotlib.text.Text at 0x2690a46b780>

Part II: Applying a Trained Model to Experimental Data for Extracting Coordinates of Atomic Species and/or Defects

Load experimental data

We load the experimental STEM images for which we want to extract positions of all the atomic species and/or defects.

In [2]:
# Loading experimental images and making them compatible with tensorflow format

def preprocessing(image_data, image_size):
    image_data = image_data.reshape(image_data.shape[0], image_size[0], image_size[1], 1)
    image_data = image_data.astype('float32')
    image_data = (image_data - np.amin(image_data))/(np.amax(image_data) - np.amin(image_data))
    return image_data

def load_data(filepath, image_size):
    structure = np.array([])
    filename = []
    n_images = 0
    for img in glob.glob(filepath):
        filename.append(os.path.splitext(os.path.basename(img))[0])
        structure_i = cv2.imread(img, cv2.IMREAD_GRAYSCALE)
        structure_i = cv2.resize(structure_i, image_size, interpolation = cv2.INTER_AREA)
        structure = np.append(structure, structure_i)
        n_images += 1
    structure = structure.reshape(n_images, image_size[0], image_size[1])
    print("Experimental data loaded from", filepath, "\nno. of images:", str(n_images))
    return structure, filename

target_size = (128,128)
data_dir = "Graphene/DL_paper/STEM_real/Examples/"
STEM_real, filenames = load_data(os.path.join(data_dir,"*.png"), target_size)
STEM_real = preprocessing(STEM_real, target_size)
Experimental data loaded from Graphene/DL_paper/STEM_real/Examples/*.png 
no. of images: 7

Load FCN model(s) and apply them to real STEM/STM images

Loading trained FCN model(s)

In [3]:
# Loading models and obtaining softmax output (pixel-wise predictions)
def get_decoded_imgs(input_imgs, filepath, nb_channels = 2):
    model = load_model(filepath)
    decoded_imgs = model.predict(input_imgs)
    decoded_imgs = decoded_imgs.reshape(input_imgs.shape[0], target_size[0], target_size[1], nb_channels)
    print("FCN output obtained\n")
    return decoded_imgs

decoded_imgs = get_decoded_imgs(STEM_real, 'Graphene/DL_paper/Models/G_Lattice_AF0AF1.h5')   
FCN output obtained

Extract coordinates of lattice atoms

We now extract coordinates of atomic species and/or defects via Laplacian of Gaussian (LoG) blob detection technique (alternatively, one may also use center of mass (CoM) blob detection technique, which we also included here). The extracted data is then stored in a dictionary format together with the original experimental image and softmax output.

In [4]:
# Extract coordinates via LoG
def get_coordinates(input_imgs, decoded_imgs, channel = 1, method = 'LoG',
                    min_sigma = 1.5, max_sigma = 10, threshold = 0.8, dist_edge = 3):
    d_coord = {}
    for i in range(STEM_real.shape[0]):
        input_img = input_imgs[i,:,:,0]
        decoded_img = decoded_imgs[i,:,:,channel]
        _, decoded_img_c = cv2.threshold(decoded_img, threshold, 1, cv2.THRESH_BINARY)
        decoded_img_c = decoded_img_c[dist_edge:decoded_img_c.shape[0]-dist_edge,
                        dist_edge:decoded_img_c.shape[0]-dist_edge] # do not consider atoms very close to edges
        if method == 'LoG':
            coord = blob_log(decoded_img_c, min_sigma=min_sigma, max_sigma=max_sigma)
            coord = np.transpose(coord)[0:2,:]
        elif method == 'CoM':
            labels, nlabels = ndimage.label(decoded_img_c)
            coord = np.array(ndimage.center_of_mass(decoded_img_c, labels, np.arange(nlabels)+1))
            coord = np.transpose(coord.reshape(coord.shape[0], 2))
        coord = coord + dist_edge 
        dictionary = OrderedDict()
        dictionary['filename'] = filenames[i]
        dictionary['Experimental_image_{0}'.format(i)] = input_img
        dictionary['FCN_output_{0}'.format(i)] = decoded_img
        dictionary['coordinates_{0}'.format(i)] = coord
        d_coord[i] = dictionary
    print("Atomic/defect coordinates extracted.\n")
    return d_coord

# Save the obtained dictionary
def save_coord(filepath, d_coord):
    np.save(os.path.join(filepath + '.npy'), d_coord)
    print("Atomic/defect coordinates saved to disk.\n")

atomic_coord = get_coordinates(STEM_real, decoded_imgs, channel = 1)
save_coord("Graphene/DL_paper/DL_output/lattice_coord", atomic_coord)
Atomic/defect coordinates extracted.

Atomic/defect coordinates saved to disk.

Plotting results

Plotting the original image, FCN output and extracted lattice coordinates.
Note that we did not specifically train our model to find non-hexagonal reconstructions of the lattice (see e.g. image 4 and image 5) - the model "figured it out" by itself. In some (limited) scenarious, a model may confuse a "dopant"/"adatom" with lattice atom. This can be addressed later during a refinement stage.

In [5]:
# Make plots
def make_plots(d_coord, n, marker_color = 'red', marker_size = 10):
    if n > len(d_coord.keys()):
        n = len(d_coord.keys()) + 1
    else: n = n + 1
    print('Plotting the results...\n')
    plt.figure(figsize=(16, 8), dpi = 96)
    for i in range(1, n):
        filename, experimental_image, FCN_output, coordinates = d_coord[i-1].values()
        y, x = coordinates
        ax = plt.subplot(3, n, i)
        plt.imshow(experimental_image, cmap = 'gray')
        plt.title('Experimental image {0}\n'.format(i) + '$\itfilename$: ' + filename[0:20], fontsize = 10)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        plt.tight_layout()
        ax = plt.subplot(3, n, i + n)
        plt.imshow(FCN_output, cmap = 'jet', Interpolation = 'Gaussian')
        plt.imshow(experimental_image, cmap = 'gray', alpha = 0.4)
        plt.title('FCN output {0}'.format(i), fontsize = 10)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        plt.tight_layout()
        ax = plt.subplot(3, n, i + 2*n)
        plt.scatter(x, y, marker = "+", s = marker_size, linewidth = 1.5, color = marker_color)
        plt.xlim(0,target_size[0])
        plt.ylim(target_size[1],0)
        plt.imshow(experimental_image, cmap = 'gray')
        plt.title('Coordinates {0}'.format(i), fontsize = 10)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        plt.tight_layout()
    plt.show() 

make_plots(atomic_coord, n = 5)
Plotting the results...

Extract coordinates of defects

Same procedure as for the atomic lattice, but now applied to defects ("dopants" and "vacancies")

In [13]:
# Applying FCN and LoG to analysis of "dopants" in STEM images
decoded_imgs1 = get_decoded_imgs(STEM_real, 'Graphene/DL_paper/Models/G_adatom_mixed1.h5')  
dopant_coord = get_coordinates(STEM_real, decoded_imgs1, min_sigma = 2.5, threshold = 0.8, channel = 1, dist_edge = 0)
save_coord("Graphene/DL_paper/DL_output/dopant_coord", dopant_coord)

make_plots(dopant_coord, n = 5, marker_color = 'blue')
FCN output obtained

Atomic/defect coordinates extracted.

Atomic/defect coordinates saved to disk.

Plotting the results...

In [14]:
# Applying FCN and LoG to analysis of "vacancies" in STEM images
decoded_imgs2 = get_decoded_imgs(STEM_real, 'Graphene/DL_paper/Models/G_vac_mixed2.h5') 
vac_coord = get_coordinates(STEM_real, decoded_imgs2, min_sigma = 5., channel = 1)
save_coord("Graphene/DL_paper/DL_output/vac_coord", vac_coord)

make_plots(vac_coord, n = 5, marker_color = 'orange')
FCN output obtained

Atomic/defect coordinates extracted.

Atomic/defect coordinates saved to disk.

Plotting the results...

Put everything together.

We will now save all the (refined) coordinates of atoms and defects for each image as a separate csv file, which can be later used as an input for density functional theory calculations (which may include a “refinement” of structural properties as well as calculations of electronic and magnetic properties of the learned defects).

In [16]:
def save_all_coord(dir_name, atomic_lattice = 'C', dopant = 'Si'):
    dict_all_coord = {}
    for i in range(STEM_real.shape[0]):
        filename, _, _, coordinates = atomic_coord[i].values()
        _, _, _, coordinates_ = dopant_coord[i].values()
        if coordinates_.shape[1] > 0:
            tree = spatial.cKDTree(np.transpose(coordinates))
            overlap_indices = tree.query_ball_point(np.transpose(coordinates_), 2)
            overlap_indices = list(itertools.chain.from_iterable(overlap_indices))
            coordinates = np.transpose(np.delete(np.transpose(coordinates),overlap_indices, axis = 0))
        index = [atomic_lattice]*coordinates.shape[1]
        index_ = [dopant]*coordinates_.shape[1]
        index.extend(index_)
        coordinates = np.concatenate((coordinates, coordinates_), axis = 1)
        x, y = coordinates
        columns = ['x','y']
        df = pd.DataFrame({'x':x, 'y':y}, index = index, columns = columns)
        dictionary = OrderedDict()
        dictionary['coordinates'] = df
        dictionary['experimental image'] = STEM_real[i].reshape(target_size[0], target_size[1])
        dict_all_coord[filename] = dictionary
        df.to_csv(os.path.join(os.path.join(dir_name, filename + '.csv')))
        np.save(os.path.join(dir_name, 'all_coord.npy'), dict_all_coord)
    print('Atomic/defect coordinates saved to disk.')
    return dict_all_coord

dict_all_coord = save_all_coord('Graphene/DL_paper/DL_output/Coordinates/')
Atomic/defect coordinates saved to disk.

Applying a model trained on graphene to MoSe$_2$

Finally, we demonstrate that it is possible to use our trained model(s) for finding atoms and defects in a different material that has similar but distinct atomic lattice structure.

In [17]:
# Loading experimental data on MoSe2 doped with W. 
data_dir = "Graphene/DL_paper/STEM_real/Examples/MoWSe2/"
STEM_real, filenames = load_data(os.path.join(data_dir,"*.png"), target_size)
STEM_real = preprocessing(STEM_real, target_size)
Experimental data loaded from Graphene/DL_paper/STEM_real/Examples/MoWSe2/*.png 
no. of images: 5

Finding positions of atomic columns in MoSe$_2$ system

Our FCN model, which was trained on graphene lattice, can locate atomic columns in the STEM of Mo$_{1-x}$W$_x$Se$_2$ that has two inequivalent sites (in terms of their intensities/atomic number) in the unit cell associated with a single Mo atom and with two stacked Se atoms.

In [19]:
decoded_imgs_ = get_decoded_imgs(STEM_real, 'Graphene/DL_paper/Models/G_Lattice_AF0AF1.h5')  
dopant_coord = get_coordinates(STEM_real, decoded_imgs_, method = 'CoM', min_sigma = 1.5, threshold = 0.95, channel = 1, dist_edge = 3)
save_coord("Graphene/DL_paper/DL_output/MoSe2/dopant_coord", dopant_coord)

make_plots(dopant_coord, n = 5, marker_color = 'red')
FCN output obtained

Atomic/defect coordinates extracted.

Atomic/defect coordinates saved to disk.

Plotting the results...

Finding positions of W dopants in MoSe$_2$ system

Our model is clearly able to detect W dopants which are characterized by a larger intensity in STEM. It is worth noting that the FCN was able to distinguish accurately between the variations in the intensity associated with two different sublattices and that associated with the presence of dopants, even though such specific information was not included in our training dataset.

In [20]:
decoded_imgs1_ = get_decoded_imgs(STEM_real, 'Graphene/DL_paper/Models/G_adatom_mixed1.h5')  
dopant_coord = get_coordinates(STEM_real, decoded_imgs1_, min_sigma = 1.5, threshold = 0.7, channel = 1, dist_edge = 5)
save_coord("Graphene/DL_paper/DL_output/MoSe2/dopant_coord", dopant_coord)

make_plots(dopant_coord, n = 5, marker_color = 'blue')
FCN output obtained

Atomic/defect coordinates extracted.

Atomic/defect coordinates saved to disk.

Plotting the results...

Finding positions of atomic vacancies in MoSe$_2$ system

Our model found two types of blobs characterized by different intensities (softmax probabilities) and widths and associated with “full” vacancies (two Se atoms missing from a column) and “half” vacancies (one Se atom missing from a column). See the paper's main text for details. Importantly, it was not our initial goal to search for the “half” vacancies, nor was the network specifically trained to discover such structures.

In [22]:
decoded_imgs2_ = get_decoded_imgs(STEM_real, 'Graphene/DL_paper/Models/G_vac_mixed2.h5') 
vac_coord = get_coordinates(STEM_real, decoded_imgs2_, min_sigma = 1., max_sigma = 3, threshold = 0.2, channel = 1)
save_coord("Graphene/DL_paper/DL_output/MoSe2/vac_coord", vac_coord)

make_plots(vac_coord, n = 5, marker_color = 'orange')
FCN output obtained

Atomic/defect coordinates extracted.

Atomic/defect coordinates saved to disk.

Plotting the results...

Part III: From Pixels to Chemistry: Graph-Based Represenation of Detected Atomic Defects

Create graph nodes

We now show how one can represent defects in a form of graphs that alows to analyze them based on their chemical structure (e.g. coordination number and bond angles). We will demonstrate this procedure in details for one of the experimental STEM images. We start with uploading the saved (x,y) coordinates file from the DL+LoG/CoM analysis of this STEM image and creating graph nodes for each atomic positions.

In [27]:
# Load file with coordinates    
def load_coordinates(dirname_coord, dirname_exp, filename):
    dataframe_c = pd.read_csv(os.path.join(dirname_coord, filename), sep=',')
    filename_c = (os.path.splitext(os.path.basename(filename))[0])
    experimental_image = cv2.imread(os.path.join(dirname_exp, filename_c + '.png'), cv2.IMREAD_GRAYSCALE)
    experimental_image = cv2.resize(experimental_image, target_size, interpolation = cv2.INTER_AREA)
    return dataframe_c, filename_c, experimental_image

# Make Graph nodes    
def make_graph_nodes(dataframe):
    u_atoms, indices = np.unique(dataframe.values[:,0], return_index=True)
    print('Found the following atomic species:', ', '.join(u_atoms.tolist()))
    U=nx.Graph()
    for j in range(len(indices)):
        if j + 1 != len(indices):
            for i, (idx, x, y) in enumerate(dataframe.values[indices[j] : indices[j+1]]):
                U.add_node(idx+" {}".format(i+1), pos=(y, x))
        else:
            for i, (idx, x, y) in enumerate(dataframe.values[indices[j] : ]):
                U.add_node(idx+" {}".format(i+1), pos=(y, x))
    pos=nx.get_node_attributes(U,'pos')
    n_nodes = len(U.nodes())
    print('Created', str(n_nodes), 'graph nodes corresponding to atomic species') 
    return U, n_nodes, pos, u_atoms
    
dirname_coord = 'Graphene/DL_paper/DL_output/Coordinates/'
dirname_exp = 'Graphene/DL_paper/STEM_real/Examples/'
filename = '3_60kV-0136.csv'
df_c, filename_c, experimental_image = load_coordinates(dirname_coord, dirname_exp, filename)
U, n_nodes, pos, atomic_species = make_graph_nodes(df_c)
Found the following atomic species: C, Si
Created 173 graph nodes corresponding to atomic species

Create graph edges

We now apply a constraints based on maximum allowed length of chemical bonds for pairs of atomic species detected earlier to creat edges between the nodes. These are provided as a user input.

In [28]:
# Calculates distances between nodes of a given graph(s)
def dist(U1, U2, p1, p2):
    return np.sqrt((U1.node[p1]['pos'][1]-U2.node[p2]['pos'][1])**2 + 
           (U1.node[p1]['pos'][0]-U2.node[p2]['pos'][0])**2)

# Creates dictionary of atomic pairs with a maximum allowed bond length for each pair (as user's input)
def atomic_pairs_data(atomic_species, target_size, *args, **kwargs):
    try:
        image_size, = kwargs.values()
    except:
        image_size = float(input("\nEnter the size of the image '" + str(filename_c) + "' in picometers:\n"))
    atomic_pairs =  list(set(tuple(sorted(a)) for a in itertools.product(atomic_species, repeat = 2)))
    atomic_pairs_dict = {}
    for pair in atomic_pairs:
        dictionary = {}
        if args:
            for pair_i in list(itertools.permutations(pair)):
                if pair_i in args[0].keys():
                    l = args[0][pair_i]
        else:
            print('\nEnter the maximum allowed bond length (in picometers) for a pair', str(pair) + ':')
            l = float(input())
        l_px = l*(target_size[0]/image_size)
        dictionary['atomic_pair_bond_length'] = l_px
        atomic_pairs_dict[pair] = dictionary
    
    return atomic_pairs_dict, image_size

    
# Add edges to a graph (aka "chemical bonds")
def create_graph_edges(U, atomic_pairs_d):
    for k in atomic_pairs_d.keys():
        for p1 in U.nodes():
            for p2 in U.nodes():
                if all([(p1.split()[0] == k[0]),
                        (p2.split()[0] == k[1])]):
                    distance = dist(U, U, p1, p2)
                    if 0 < distance < atomic_pairs_d[k]['atomic_pair_bond_length']:
                        U.add_edge(p1, p2)

atomic_pairs_d, image_size = atomic_pairs_data(atomic_species, target_size)                    
create_graph_edges(U, atomic_pairs_d)
Enter the size of the image '3_60kV-0136' in picometers:
2200

Enter the maximum allowed bond length (in picometers) for a pair ('C', 'Si'):
210

Enter the maximum allowed bond length (in picometers) for a pair ('C', 'C'):
175

Enter the maximum allowed bond length (in picometers) for a pair ('Si', 'Si'):
250

Determine atomic defects coordination number and 1st 'coordination sphere'

In [29]:
# Calculates a coordination number for atomic species of interest (in non-refined structure) 
# and lists atomic species in their 1st coordination sphere
def coord_no(dopant):
    print('\n\nDopant coordination:')
    i = 1
    for atuple in [degree for degree in U.degree().items() if dopant in degree[0]]:
        print('\n#{}'.format(i), dopant, "dopant coordination:", str(atuple[1]) + "-fold")
        i += 1
    i = 1    
    for d in [node for node in U.nodes() if node.split()[0] == dopant]:
        nbs_list = []
        for nbs in U.neighbors(d):
            nbs_list.append(nbs.split()[0])
        print('\n#{}'.format(i), dopant, "dopant 1st coordination 'sphere':", ', '.join(nbs_list))
        i += 1
        
dopant = str(input('Enter the name of dopant (e.g. N, P, Si, etc):\n'))
coord_no(dopant)
Enter the name of dopant (e.g. N, P, Si, etc):
Si


Dopant coordination:

#1 Si dopant coordination: 4-fold

#2 Si dopant coordination: 4-fold

#1 Si dopant 1st coordination 'sphere': C, C, C, C

#2 Si dopant 1st coordination 'sphere': C, C, C, C

Plot resultant graph

In [30]:
def plot_graph(Graph, atomic_labels = True, overlay = False, draw_bound_box = False, draw_defect_labels = False):
    
    c_nodes = 5.*(image_size/n_nodes)
    c_fonts = 0.5*(image_size/n_nodes)
    
    color_map = []
    colors = ['red', 'blue', 'orange', 'black', 'magenta', 'gray', 'green']
    for node in U.nodes():
        for i in range(len(atomic_species)):
            if node.split()[0] == atomic_species[i]:
                    color_map.append(colors[i])

    labels_atoms = {}    
    for i in range (len(atomic_species)):
        for node in U.nodes():
            if node.split()[0] == atomic_species[i]:
                labels_atoms[node] = atomic_species[i]
                    
    fig = plt.figure(figsize=(6, 3), dpi = 150) 
    
    ax = fig.add_subplot(1, 2, 1)
    plt.xlim(10,120)
    plt.ylim(120,10)
    plt.xticks([])
    plt.yticks([])
    ax.set_aspect('equal')
    plt.title('Experimental image\n' + filename_c, fontsize = 8)
    plt.imshow(experimental_image, cmap = 'gray')
    
    ax = fig.add_subplot(1, 2, 2)
    alpha_ = 1
    edge_color_ = 'gray'
    if overlay == True:
        plt.imshow(experimental_image, cmap = 'gray')
        alpha_ = 0.4
        edge_color_ = 'white'
    plt.title('Graph representation\n' + filename_c, fontsize = 8)
    plt.xlim(10,120)
    plt.ylim(120,10)
    plt.xticks([])
    plt.yticks([])
    ax.set_aspect('equal')

    nx.draw_networkx_nodes(U, pos = pos, nodelist = U.nodes(), node_color = color_map,
                           node_size = c_nodes, alpha = alpha_)
    nx.draw_networkx_edges(U,pos,width=1.5, edge_color = edge_color_, alpha = alpha_)
    if atomic_labels == True:
        nx.draw_networkx_labels(U,pos,labels = labels_atoms,font_size = c_fonts)
    elif atomic_labels == False:
        nx.draw_networkx_labels(U,pos,font_size = c_fonts/2)
    
    if draw_bound_box == True:
        try:
            for l in range(len(defect_com.keys())):
                x = int(np.around(defect_com[l][0]))
                y = int(np.around(defect_com[l][1]))
                bb_size = get_bound_box(sub_graphs)[l]*1.75
                rect = patches.Rectangle((x-bb_size/2,y-bb_size/2), bb_size, bb_size,
                             linewidth=1,edgecolor='orange',facecolor='none')
                ax.add_patch(rect)
        except:
            print('!! Information on the bound box parameters was not found !!')
    
    if draw_defect_labels == True:
        try:
            for l in range(len(defect_com.keys())):
                x = int(np.around(defect_com[l][0]))
                y = int(np.around(defect_com[l][1]))
                bb_size = get_bound_box(sub_graphs)[l]
                if bb_size/2 <= 12:
                    fontsize = np.around(bb_size/2)
                else: fontsize = 12
                plt.text(x-0.75*bb_size, y-bb_size, 'Defect #'+str(l+1), style='italic', color = 'orange', fontsize = fontsize)
        except:
            print('!! Information on defect labels was not found !!')
    
    plt.show()
    

plot_graph(U, atomic_labels = True)

Extracting single defect structures

First we remove lattice atoms which are not directly connected to dopants/defects. We then also introduce constraints for a maximum possible number of chemical bonds (edges) for an atom (node). This can be provided as a user input.

In [31]:
def refine_structure(U, *args, **kwargs):
    try: 
        lattice_atom, dopant = args[0].values()
    except:
        lattice_atom = str(input('\nEnter the name of lattice atoms (e.g. C, Ga, Si, etc):\n'))
        dopant = str(input('\nEnter the name of dopant (e.g. N, P, Si, etc):\n'))
    
    remove_l_edges = [edge for edge in U.edges() if dopant not in str(edge)]
    U.remove_edges_from(remove_l_edges)
    remove_lone_atoms = [node for node, degree in U.degree().items() if degree == 0 and node.split()[0] == lattice_atom]
    U.remove_nodes_from(remove_lone_atoms)
    print('\nAll lattice atoms not directly connected to a dopant has been removed\n')
    
    try:
        max_coord, = kwargs.values()
    except:
        max_coord = int(input('Enter the expected maximum coordination number for dopant atom:\n'))
    
    dopant_extra = [node for node, degree in U.degree().items() if degree > max_coord and node.split()[0] == dopant]

    for atom in dopant_extra:
        remove_dopant_edges = []
        while len(U.neighbors(atom)) > max_coord:
            neighbors = U.neighbors(atom)
            neighbors = [n for n in neighbors if n.split()[0] != dopant]
            Distance = np.array([])
            for nbs in neighbors:
                Dist = dist(U, U, nbs, atom)
                Distance = np.append(Distance, Dist)
            dist_max = np.unravel_index(np.argmax(Distance),Distance.shape)
            remove_dopant_edges.append(tuple((atom, neighbors[dist_max[0]])))
            U.remove_edges_from(remove_dopant_edges)

    remove_lone_atoms = [node for node, degree in U.degree().items() if degree == 0 and node.split()[0] == lattice_atom]
    U.remove_nodes_from(remove_lone_atoms)

    print('\nRefinement procedure based on the maximum coordination number has been completed\n')

refine_structure(U)
Enter the name of lattice atoms (e.g. C, Ga, Si, etc):
C

Enter the name of dopant (e.g. N, P, Si, etc):
Si

All lattice atoms not directly connected to a dopant has been removed

Enter the expected maximum coordination number for dopant atom:
4

Refinement procedure based on the maximum coordination number has been completed

Full enumeration and characterization of the atomic defects

We plot refined structures of single defects and display/store the information on the coordination number, coordination 'spheres' and relevant bond angles separately for each defect. (Note that a coordination number of dopants has not changes after refinement, which is due to high accuracy of atom/defect localization via DL+LoG). The current scheme can be easily applied to a much larger number of (different) defects per experimental image.

In [32]:
# Detects disconnected subgraphs (i.e. individial defects)
def get_subgraphs(U):
    sub_graphs = list(nx.connected_component_subgraphs(U))
    print('\n\nIdentified', len(sub_graphs), 'defect structures')
    return sub_graphs

# Calculates the center of mass for each defect
def get_defect_com(sub_graphs):
    defect_c_mass = {}
    for i, sg in enumerate(sub_graphs):
        defect_com_x = np.array([])
        defect_com_y = np.array([])
        for d in sg.node:
            defect_com_x = np.append(defect_com_x, sg.node[d]['pos'][0])
            defect_com_y = np.append(defect_com_y, sg.node[d]['pos'][1])
        defect_c_mass[i] = [np.mean(defect_com_x), np.mean(defect_com_y)]
    return defect_c_mass

def get_bound_box(sub_graphs):
    bond_length_bound_box = {}
    for i, sg in enumerate(sub_graphs):
        bond_length = {}
        bond_length_px = []
        defect_atoms = [node for node in sg.nodes()]
        for p1 in defect_atoms:
            for p2 in defect_atoms:
                bond_length_px.append(dist(U, U, p1, p2))
                bond_length_bound_box[i] = np.amax(bond_length_px)
    return bond_length_bound_box

# Calculates a coordination number for atomic species of interest
# and lists atomic species in their 1st coordination sphere
def get_coord_no(dopant, sub_graphs):
    for i, sg in enumerate(sub_graphs):
        print('\n\nDefect #{}:'.format(i+1))
        for j, atuple in enumerate([degree for degree in sg.degree().items() if dopant in degree[0]]):
            print('#{}'.format(j+1), '('+str(atuple[0])+')', dopant, 
                  "dopant coordination:", str(atuple[1]) + "-fold")
        for k, d in enumerate([node for node in sg.nodes() if node.split()[0] == dopant]):
            nbs_list = []
            for nbs in sg.neighbors(d):
                nbs_list.append(nbs.split()[0])
            print('#{}'.format(k+1), '('+str(d)+')', dopant, 
                  "dopant 1st coordination 'sphere':", ', '.join(nbs_list))

# Calculates bond lengths for individual defect structures
def get_bond_lengths(dopant, sub_graphs):
    defect_graphs_bonds = {}
    for i, sg in enumerate(sub_graphs):
        bond_length = {}
        for p1 in [node for node in sg.nodes() if node.split()[0] == dopant]:
            for p2 in sg.neighbors(p1):
                l = [p1, p2]
                sc = image_size/target_size[0]
                bond_length[' - '.join(l)] = (' - '.join(l)), dist(U, U, p1, p2)*sc
                defect_graphs_bonds[i] = bond_length
    return defect_graphs_bonds
                
# Calculates bond angles for individual defect structures 
def get_bond_angles(dopant, sub_graphs):
    defect_graphs_angles = {}   
    for i, sg in enumerate(sub_graphs):
        angles = {}
        for p1 in [node for node in sg.nodes() if node.split()[0] == dopant]:
            for atuple in list(set(tuple(sorted(a)) for a in itertools.product(sg.neighbors(p1), repeat = 2))):
                if atuple[0] != atuple[1]:
                    points = [atuple[0], p1, atuple[1]]
                    u = np.array(sg.node[points[1]]['pos']) - np.array(sg.node[points[0]]['pos'])
                    v = np.array(sg.node[points[1]]['pos']) - np.array(sg.node[points[2]]['pos'])
                    a = np.dot(u, v)
                    b = np.linalg.norm(u) * np.linalg.norm(v)
                    angles[' - '.join(points)] = ((' - '.join(points), (np.arccos(a/b) * 180 / np.pi)))
                    defect_graphs_angles[i] = angles
    
    return defect_graphs_angles

# prints defect paramters of interest
def print_defect_parameters(parameter, constr1 = 0, constr2 = float('inf')):
    if len(list(parameter[0].keys())[0]) > 15:
        units = ['Bond angles', 'degrees']
    else: units = ['Bond lengths','pm']
    print('\n\n'+str(units[0]), 'for each single defect structure:')
    for i, sg in enumerate(parameter):
        print('\nDefect #{}'.format(i+1))
        for p in parameter[sg].keys():
            if parameter[sg][p][1] > constr1 and parameter[sg][p][1] < constr2:
                print(parameter[sg][p][0] + ": ", np.around(parameter[sg][p][1], decimals = 2), units[1])

sub_graphs = get_subgraphs(U)
defect_com = get_defect_com(sub_graphs)
get_coord_no(dopant, sub_graphs)
print_defect_parameters(get_bond_angles(dopant, sub_graphs), constr1 = 20, constr2 = 160)
plot_graph(U,  atomic_labels = False, overlay =True, draw_bound_box = True, draw_defect_labels = True)

Identified 2 defect structures


Defect #1:
#1 (Si 1) Si dopant coordination: 4-fold
#1 (Si 1) Si dopant 1st coordination 'sphere': C, C, C, C


Defect #2:
#1 (Si 2) Si dopant coordination: 4-fold
#1 (Si 2) Si dopant 1st coordination 'sphere': C, C, C, C


Bond angles for each single defect structure:

Defect #1
C 58 - Si 1 - C 62:  82.5 degrees
C 38 - Si 1 - C 42:  90.0 degrees
C 38 - Si 1 - C 58:  97.5 degrees
C 42 - Si 1 - C 62:  90.0 degrees

Defect #2
C 100 - Si 2 - C 83:  93.75 degrees
C 73 - Si 2 - C 83:  86.25 degrees
C 100 - Si 2 - C 87:  95.71 degrees
C 73 - Si 2 - C 87:  84.29 degrees

Put everything together

... and obtain a full decoding of various experimental images with different types of defects. This method can now be used for classification of different defect types in one image as well as movies of defect transformations. The latter includes, for example, tracking of 3-fold coordination to 4-fold coordination switching of a Si defect, for which we can assume that the species associated with the lattice and dopant stay the same (i.e. C and Si) and skip a user input part.

In [33]:
dirname_coord = 'Graphene/DL_paper/DL_output/Coordinates/'
dirname_exp = 'Graphene/DL_paper/STEM_real/Examples'
filename_list = ['2_60kV-0081-1.csv', '6_SuperScan-HAADF-12.csv', '7_Ar-O-cleaned-gr-120C-10hrs-60kV-0024.csv']

atoms = OrderedDict()
atoms['lattice_atom'] = 'C'
atoms['dopant'] = 'Si'
approx_max_bonds = {('Si', 'Si'): 250, ('C', 'C'): 175, ('Si', 'C'): 210}
# C-C bonds max length estimation was taken from: RSC Adv. 2015, 5, 105194
# Si-C bonds max length estimation was taken from: Nanoscale, 2010, 2, 1733 
for filename in filename_list:
    df_c, filename_c, experimental_image = load_coordinates(dirname_coord, dirname_exp, filename)
    U, n_nodes, pos, atomic_species = make_graph_nodes(df_c)
    
    atomic_pairs_d, image_size = atomic_pairs_data(atomic_species, target_size, approx_max_bonds)                    
    create_graph_edges(U, atomic_pairs_d)

    refine_structure(U, atoms, max_coord = 4)

    sub_graphs = get_subgraphs(U)
    defect_com = get_defect_com(sub_graphs)
    get_coord_no(dopant, sub_graphs)
    print_defect_parameters(get_bond_angles(dopant, sub_graphs))
    plot_graph(U, atomic_labels = False, overlay = True, draw_bound_box = True, draw_defect_labels = True)
Found the following atomic species: C, Si
Created 239 graph nodes corresponding to atomic species

Enter the size of the image '2_60kV-0081-1' in picometers:
2940

All lattice atoms not directly connected to a dopant has been removed


Refinement procedure based on the maximum coordination number has been completed



Identified 2 defect structures


Defect #1:
#1 (Si 2) Si dopant coordination: 3-fold
#1 (Si 2) Si dopant 1st coordination 'sphere': C, C, C


Defect #2:
#1 (Si 1) Si dopant coordination: 3-fold
#1 (Si 1) Si dopant 1st coordination 'sphere': C, C, C


Bond angles for each single defect structure:

Defect #1
C 102 - Si 2 - C 95:  119.05 degrees
C 102 - Si 2 - C 81:  126.87 degrees
C 81 - Si 2 - C 95:  114.08 degrees

Defect #2
C 53 - Si 1 - C 59:  124.25 degrees
C 35 - Si 1 - C 53:  119.98 degrees
C 35 - Si 1 - C 59:  115.77 degrees
Found the following atomic species: C, Si
Created 135 graph nodes corresponding to atomic species

Enter the size of the image '6_SuperScan-HAADF-12' in picometers:
2000

All lattice atoms not directly connected to a dopant has been removed


Refinement procedure based on the maximum coordination number has been completed



Identified 1 defect structures


Defect #1:
#1 (Si 2) Si dopant coordination: 4-fold
#2 (Si 3) Si dopant coordination: 4-fold
#3 (Si 1) Si dopant coordination: 4-fold
#1 (Si 2) Si dopant 1st coordination 'sphere': C, Si, Si, C
#2 (Si 3) Si dopant 1st coordination 'sphere': C, Si, C, Si
#3 (Si 1) Si dopant 1st coordination 'sphere': C, Si, C, Si


Bond angles for each single defect structure:

Defect #1
C 81 - Si 3 - Si 1:  156.32 degrees
C 47 - Si 1 - C 57:  100.3 degrees
C 57 - Si 1 - Si 3:  101.31 degrees
C 73 - Si 3 - C 81:  105.01 degrees
C 47 - Si 1 - Si 3:  158.39 degrees
Si 2 - Si 1 - Si 3:  60.26 degrees
C 47 - Si 1 - Si 2:  98.13 degrees
C 52 - Si 2 - Si 3:  156.37 degrees
C 66 - Si 2 - Si 3:  106.5 degrees
Si 1 - Si 3 - Si 2:  61.5 degrees
C 66 - Si 2 - Si 1:  164.74 degrees
C 81 - Si 3 - Si 2:  94.81 degrees
C 52 - Si 2 - C 66:  97.13 degrees
C 73 - Si 3 - Si 2:  160.18 degrees
C 73 - Si 3 - Si 1:  98.67 degrees
Si 1 - Si 2 - Si 3:  58.24 degrees
C 57 - Si 1 - Si 2:  161.57 degrees
C 52 - Si 2 - Si 1:  98.13 degrees
Found the following atomic species: C, Si
Created 148 graph nodes corresponding to atomic species

Enter the size of the image '7_Ar-O-cleaned-gr-120C-10hrs-60kV-0024' in picometers:
2000

All lattice atoms not directly connected to a dopant has been removed


Refinement procedure based on the maximum coordination number has been completed



Identified 1 defect structures


Defect #1:
#1 (Si 2) Si dopant coordination: 4-fold
#2 (Si 1) Si dopant coordination: 3-fold
#1 (Si 2) Si dopant 1st coordination 'sphere': C, Si, C, C
#2 (Si 1) Si dopant 1st coordination 'sphere': C, C, Si


Bond angles for each single defect structure:

Defect #1
C 104 - Si 2 - Si 1:  169.07 degrees
C 105 - Si 2 - C 88:  151.53 degrees
C 88 - Si 2 - Si 1:  88.8 degrees
C 104 - Si 2 - C 88:  102.13 degrees
C 91 - Si 1 - Si 2:  144.03 degrees
C 105 - Si 2 - Si 1:  62.73 degrees
C 80 - Si 1 - Si 2:  96.29 degrees
C 80 - Si 1 - C 91:  119.67 degrees
C 104 - Si 2 - C 105:  106.34 degrees
In [ ]: