clang plug-in instrumentation of llvm source code and analysis of function call logs (1)–google mirror

tick_plot__compile.ipynb

Duration boundary_time chain anomaly: long and short function call chain list

0. Use matplotlib to find font files larger than 1MB in the system

Chinese fonts are usually very large, so the fonts that pass this size filter usually include the Chinese ones.
In the results, 'AR PL UMing CN' is indeed a Chinese font.

from matplotlib.font_manager import fontManager
import os

# Collect the name of every registered font whose file exists on disk
# and is larger than 1e6 bytes.
fonts = []
for entry in fontManager.ttflist:
    font_path = entry.fname
    if os.path.exists(font_path) and os.stat(font_path).st_size > 1e6:
        fonts.append(entry.name)

# Print only the names that contain 'CN'.
for font in fonts:
    if 'CN' in font:
        print(font)
 AR PL UMing CN
AR PL UKai CN

!pip install scikit-learn

1. Generate tick log

Syntax error when compiling minimum main plus function 1: file mini_main_f1_err.c

//File mini_main_f1_err.c
char* calc_name(float age, bool high, char* nick){<!-- -->
  if(high & amp; & amp; age>5){<!-- -->
    return "child";
  }
  char name[32]={<!-- -->"bigPeople"};
  return name;
}
int main(int argc, char** argv){<!-- -->
    calc_name(10,false,"nick");
    return 0;
}


tick_save=true /pubx/build-llvm15/bin/clang-15 -c mini_main_printf.c

#Messages printed when the command above crashes:
#corrupted double-linked list
#pure virtual method called
#terminate called without an active exception
#Aborted (core dumped)

#The tick log file is still generated:
#-rw-rw-r-- 511M August 12 08:24 clang-15_13324_1691799886944_1


#The output object file is also produced
ls -lh mini_main_printf.o
#-rw-rw-r-- 1.4K August 12 08:29 mini_main_printf.o


#But when asked to produce an executable, the crash output is more complete:

tick_save=true /pubx/build-llvm15/bin/clang-15 mini_main_printf.c -o mmp

"""
pure virtual method called
terminate called without an active exception
clang-15: error: unable to execute command: Aborted (core dumped)
clang-15: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 15.0.0 ([email protected]:pubz/llvm-project.git 3387b19bb538e694d2d965d46c7b053d61a059e3)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /pubx/build-llvm15/bin
clang-15: note: diagnostic msg:
*********************

PLEASE ATTACH THE FOLLOWING FILES TO THE BUG REPORT:
Preprocessed source(s) and associated run script(s) are located at:
clang-15: note: diagnostic msg: /tmp/mini_main_printf-f6d921.c
clang-15: note: diagnostic msg: /tmp/mini_main_printf-f6d921.sh
clang-15: note: diagnostic msg:

*********************
malloc(): unsorted double linked list corrupted
malloc(): unsorted double linked list corrupted
malloc(): unsorted double linked list corrupted
malloc(): unsorted double linked list corrupted
malloc(): unsorted double linked list corrupted
...
Segmentation fault (core dumped)
"""

Preliminary analysis of the crash

Preliminary analysis of crash
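Use gdb to see where the crash occurs. Strangely, the crash happens after main has already returned: the backtrace shows it inside the exit handlers, in llvm::PassRegistry::~PassRegistry() while freeing memory. Since the tick log is still written out normally, I am ignoring this crash for now.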

gdb --args /pubx/build-llvm15/bin/clang-15 -c mini_main_printf.c

(gdb) set environment tick_save=true

(gdb) run
Starting program: /build/pubx/build-llvm15/bin/clang-15 -c mini_main_printf.c
[Thread debugging using libthread_db enabled]

Program received signal SIGSEGV, Segmentation fault.
0x00007ffff78a17c3 in unlink_chunk (p=p@entry=0x55555cbfc0b0, av=0x7ffff7a19c80 <main_arena>) at ./malloc/malloc.c:1634
1634 ./malloc/malloc.c: No such file or directory.

(gdb) bt
#0 0x00007ffff78a17c3 in unlink_chunk (p=p@entry=0x55555cbfc0b0, av=0x7ffff7a19c80 <main_arena>) at ./malloc/malloc.c:1634
#1 0x00007ffff78a2939 in _int_free (av=0x7ffff7a19c80 <main_arena>, p=0x55555cbfc0b0, have_lock=<optimized out>) at ./malloc/malloc.c:4607
#2 0x00007ffff78a54d3 in __GI___libc_free (mem=<optimized out>) at ./malloc/malloc.c:3391
#3 0x00005555570ebb47 in llvm::PassRegistry::~PassRegistry() ()
#4 0x00007ffff7845495 in __run_exit_handlers (status=0, listp=0x7ffff7a19838 <__exit_funcs>, run_list_atexit=run_list_atexit@entry=true, run_dtors=run_dtors@entry=true)
    at ./stdlib/exit.c:113
#5 0x00007ffff7845610 in __GI_exit (status=<optimized out>) at ./stdlib/exit.c:143
#6 0x00007ffff7829d97 in __libc_start_call_main (main=main@entry=0x5555561c70e0 <main>, argc=argc@entry=3, argv=argv@entry=0x7fffffffdc38) at ../sysdeps/nptl/libc_start_call_main.h:74
#7 0x00007ffff7829e40 in __libc_start_main_impl (main=0x5555561c70e0 <main>, argc=3, argv=0x7fffffffdc38, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>,
    stack_end=0x7fffffffdc28) at ../csu/libc-start.c:392
#8 0x00005555561aef05 in _start ()

1. Prefix

# %config InlineBackend.figure_format = 'svg'
%config InlineBackend.rc={<!-- -->'figure.figsize': (30,20)}

sklearn various normalizations

# The sklearn scalers used for the various kinds of normalization
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler

# One instance of each scaler, created in the same order as imported
minMaxScaler, standardScaler, maxAbsScaler = MinMaxScaler(), StandardScaler(), MaxAbsScaler()


# df['Stack Net'] = minMaxScaler.fit_transform(df[['Stack Net']])

Import packages

import numpy
import math
import seaborn
import plotly.offline as plotly_offline
plotly_offline.init_notebook_mode(connected=True)
import plotly.graph_objs as graph_objs
import plotly.figure_factory as figure_factory

from plotly.graph_objs import Scatter, Figure, Scatter3d

Fix the rendering of Chinese text in matplotlib. Chinese characters are drawn as empty squares because the font in use has no Chinese glyphs. Here the font is set to the Chinese font found above, 'AR PL UMing CN'.

import matplotlib.pyplot

# Use the Chinese-capable font both as the font family and as the
# sans-serif font list.
matplotlib.pyplot.rcParams.update({
    'font.family': 'AR PL UMing CN',
    'font.sans-serif': ['AR PL UMing CN'],
})

pandas displays the maximum number of rows

import pandas
# pandas.options.display.max_columns = None
# pandas.options.display.max_rows = None

1b Drawing pre-processing

def increase_g_size(g):
    """Enlarge the colour markers in the legend of *g* and make them opaque.

    Args:
        g: An object exposing its legend as ``g.legend_``.

    Returns:
        None. The legend handles are modified in place. Nothing happens when
        *g* has no legend.
    """
    legend = getattr(g, "legend_", None)
    if legend is None:
        return

    # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
    # removed the old name; try the new name first, then fall back.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = getattr(legend, "legendHandles", ())

    for lh in handles:
        lh.set_alpha(1)
        lh._sizes = [500]


import warnings
# Filter out RuntimeWarning warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

2. Load tick log

%%sh
# Show the last entry of the tick log directory listing, sorted oldest-first by modification time
ls -lhrt /tick_data_home/ | tail -n 1
#-rw-rw-r-- 1 zz zz 511M August 14 18:11 clang-15_13324_1691799886944_1
# Tick log of a run in which compilation was normal: mini_main.c
tick_log_path = "/tick_data_home/clang-15_13324_1691799886944_1"
# Fields are comma-separated and quoted with single quotes
df = pandas.read_csv(tick_log_path, sep=',', quotechar="'")

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3236685 entries, 0 to 3236684
Data columns (total 22 columns):
 # Column Dtype
--- ------ -----
 0 ticks int64
 1 funcLocalClock int64
 2 tickKind int64
 3 funcEnterId int64
 4 hasFuncCallChain int64
 5 funcEnterIdSeqLen int64
 6 funcEnterIdSeq object
 7 rTSVarC int64
 8 d stack raw int64
 9 d stack dead int64
 10 d heap raw int64
 11 d heap death int64
 12 stack int64
 13 stack dead int64
 14 stack net int64
 15 heap int64
 16 heap death int64
 17 heap net int64
 18 srcFile object
 19 funcLine int64
 20 funcCol int64
 21 funcName object
dtypes: int64(19), object(3)
memory usage: 543.3+ MB

tickKind definition

File: /pubx/clang-ctk/t_clock_tick/t_clock_tick.cpp
`cpp
/* Tick kinds.
 * The ticks that normally need to be analysed are normal ticks and function-return ticks.
 * Function-entry ticks are not needed for normal analysis; they are used to find where an
 * X__funcReturn call was not inserted, by checking whether function-entry ticks and
 * function-return ticks match.
 */
enum TickKind{
 // Normal tick
 NormalTick=0,
 // Function-entry tick: compared with function-return ticks to see where an X__funcReturn is missing
 FuncEnter=1,
 // Function-return tick
 FuncReturn=2

};

# Tick kind codes as stored in the tickKind column:
#   NormalTick - normal tick
#   FuncEnter  - function entry
#   FuncReturn - function return tick
NormalTick, FuncEnter, FuncReturn = 0, 1, 2

We do not pay attention to general ticks here, so deleting general ticks can greatly improve the running speed of this script

print(df.shape)

# Delete, in place, every row whose tickKind is NormalTick
normal_tick_rows = df.loc[df['tickKind'] == NormalTick].index
df.drop(index=normal_tick_rows, inplace=True)

print(df.shape)
(3236685, 22)
(666718, 22)

Because the funcEnterIdSeq structure in tick.cpp is very dirty, it needs to be cleaned

funcEnterIdSeq holds a real string only on function-entry rows. The other rows (function return, normal tick) contain NaN, so NaN must be replaced with an empty string.

# Replace NaN with '' and assign the result back to the column.
# fillna(inplace=True) on the df['col'] selection is chained assignment: pandas
# warns about it, and under Copy-on-Write it no longer updates df.
df['funcEnterIdSeq'] = df['funcEnterIdSeq'].fillna('')

The funcEnterIdSeq values obviously carry an extra pair of single quotes; remove them.
In a value such as -2100558033#2#1#, the leading negative number comes from reading a memory area that does not belong to the sequence, so it needs to be removed.

# Cut off everything up to and including the first '#' (the bogus leading number),
# then delete the stray single quotes. A value without '#' is kept whole, because
# find() returns -1 and the slice then starts at index 0.
df['funcEnterIdSeq'] = df['funcEnterIdSeq'].apply(
    lambda seqK: seqK[seqK.find("#") + 1:].replace("'", "")
)

3. funcId structure

df.head(1)

3.1 New column funcLoc: the string-style equivalent of funcId

# New column funcLoc: "<srcFile>_<funcLine>_<funcCol>" for every row.
# Built with column-wise string concatenation instead of a per-row apply.
df['funcLoc'] = (
    df['srcFile'].astype(str) + '_'
    + df['funcLine'].astype(str) + '_'
    + df['funcCol'].astype(str)
)
df['funcLoc'].values[:4]
array(['/pubx/llvm-project/llvm/lib/Support/CommandLine.cpp_42_42',
       '/pubx/llvm-project/llvm/lib/Support/CommandLine.cpp_41_41',
       '/pubx/llvm-project/llvm/lib/Support/ManagedStatic.cpp_77_77',
       '/pubx/llvm-project/llvm/lib/Support/Threading.cpp_36_36'],
      dtype=object)

3.2 Use the integers in [0, number of unique funcLoc values - 1] as the funcId table

# The trailing "#..." values below are the results the author recorded for each expression.
funcLoc_values=df['funcLoc'].values
type(funcLoc_values)#numpy.ndarray
len(funcLoc_values)#1014494

funcLoc_list=list(funcLoc_values)
len(funcLoc_list)#1014494

# funcLoc values converted to a set
funcLoc_set=set(funcLoc_values)
len(funcLoc_set)#2838

# List of unique funcLoc values (uq: unique), in order of first appearance.
# list(set(...)) would give an order that changes between interpreter runs
# (string hashing is randomised), making the funcIds derived from list
# positions non-reproducible; dict.fromkeys keeps insertion order.
uqFuncLoc_Ls=list(dict.fromkeys(funcLoc_list))
len(uqFuncLoc_Ls)#2838
#uq:unique

# funcId table: maps each unique funcLoc to its position in uqFuncLoc_Ls
funcId_Tab={fL: j for j,fL in enumerate(uqFuncLoc_Ls)}
list(funcId_Tab.items())[:5]

#In the previous version, this output source file path field has other characters stuck to the left and right, which is obviously caused by the use of the released memory area.
#In this version, there is no such problem. The source file path field is very clean, indicating that the problem has been fixed.
 [('/pubx/llvm-project/llvm/lib/Support/MemoryBuffer.cpp_83_83', 0),
 ('/pubx/llvm-project/clang/lib/Driver/Driver.cpp_58_58', 1),
 ('/pubx/llvm-project/clang/lib/CodeGen/CodeGenTypes.cpp_69_69', 2),
 ('/pubx/llvm-project/llvm/lib/MC/MCFragment.cpp_64_64', 3),
 ('/pubx/llvm-project/llvm/lib/Support/VirtualFileSystem.cpp_57_57', 4)]
# Reverse table: look up the funcLoc for a given funcId
funcId2Loc_Tab=dict(enumerate(uqFuncLoc_Ls))
print("funcId number:",len(funcId_Tab))
funcId number: 2738

3.3 Use the funcLoc column and the funcId table to add a new column funcId

# New column funcId: the funcId_Tab entry for each row's funcLoc
# (a funcLoc missing from the table raises KeyError)
df['funcId']=df['funcLoc'].map(funcId_Tab.__getitem__)
df['funcId'].values[:10]
df.head(2)
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 666718 entries, 0 to 3236683
Data columns (total 24 columns):
 # Column Non-Null Count Dtype
--- ------ -------------- -----
 0 ticks 666718 non-null int64
 1 funcLocalClock 666718 non-null int64
 2 tickKind 666718 non-null int64
 3 funcEnterId 666718 non-null int64
 4 hasFuncCallChain 666718 non-null int64
 5 funcEnterIdSeqLen 666718 non-null int64
 6 funcEnterIdSeq 666718 non-null object
 7 rTSVarC 666718 non-null int64
 8 d stack student 666718 non-null int64
 9 d stack death 666718 non-null int64
 10 d heap 666718 non-null int64
 11 d heap death 666718 non-null int64
 12 stack student 666718 non-null int64
 13 stack dead 666718 non-null int64
 14 stack net 666718 non-null int64
 15 heap 666718 non-null int64
 16 heap death 666718 non-null int64
 17 heap net 666718 non-null int64
 18 srcFile 666718 non-null object
 19 funcLine 666718 non-null int64
 20 funcCol 666718 non-null int64
 21 funcName 666718 non-null object
 22 funcLoc 666718 non-null object
 23 funcId 666718 non-null int64
dtypes: int64(20), object(4)
memory usage: 127.2+ MB