N-Gram Algorithm for a 32-Bit NASM Program
#include <stdio.h>
#include <string.h>
int n_gram(char* str_1, int size_1, char* str_2, int size_2, int n);

/*
 * Test driver for n_gram().
 *
 * Prompts for a file name on stdin, then reads that file line by line.
 * Each line is expected to hold "<n> <string1> <string2>"; for every
 * well-formed line the n-gram similarity is printed as a percentage.
 * Lines that do not parse, or whose n is not positive, are reported on
 * stderr and skipped.
 *
 * Returns 0 on success, 1 when no file name could be read or the file
 * could not be opened.
 */
int main(void)
{
    char buffer[512];
    FILE *input;
    int n;
    int result;
    char string1[100];
    char string2[100];

    printf("Input filename?: ");
    if (fgets(buffer, sizeof buffer, stdin) == NULL)
    {
        /* EOF or read error: there is no file name to open. */
        printf("Unable to open file!\n");
        return 1;
    }
    /* Cut at the newline if there is one; unlike strlen() - 1 this is
     * safe for an empty string and for input without a trailing newline. */
    buffer[strcspn(buffer, "\n")] = 0;

    input = fopen(buffer, "rt");
    if (input == NULL)
    {
        printf("Unable to open file!\n");
        return 1;
    }

    while (fgets(buffer, sizeof buffer, input) != NULL)
    {
        /* %99s keeps each word inside its 100-byte buffer (99 chars + NUL). */
        if (sscanf(buffer, "%d %99s %99s", &n, string1, string2) != 3 || n < 1)
        {
            /* Do not call n_gram with stale or meaningless arguments. */
            fprintf(stderr, "Skipping malformed line: %s", buffer);
            continue;
        }
        result = n_gram(string1, strlen(string1), string2, strlen(string2), n);
        printf("Similarity %d-gram between %s and %s: %d%%\n", n, string1, string2, result);
    }

    fclose(input);
    return 0;
}
SECTION .data
; Running counters used by n_gram; the routine resets both to zero on
; entry, so the initial values here only matter before the first call.
SnS:    dd      0               ; matches found between n-grams of the two strings
SuS:    dd      0               ; union size: n-grams seen minus matches
SECTION .text
global n_gram

; Stack-frame offsets of the cdecl arguments, relative to ebp.
NG_ARG_STR1     equ 8
NG_ARG_SIZE1    equ 12
NG_ARG_STR2     equ 16
NG_ARG_SIZE2    equ 20
NG_ARG_N        equ 24
NG_PERCENT      equ 100

;-----------------------------------------------------------------------
; int n_gram(char* str_1, int size_1, char* str_2, int size_2, int n);
;
; Compares every n-byte window of str_1 with every n-byte window of
; str_2 and returns 100 * SnS / SuS, where
;   SnS = number of window pairs that are byte-for-byte equal
;   SuS = (windows in str_1) + (windows in str_2) - SnS
;
; ABI:   32-bit cdecl, arguments on the stack, result in eax
; Out:   eax = similarity in percent; 0 when n <= 0 or when SuS is not
;        positive (no windows at all, or repeated matches cancelled the
;        union count), instead of faulting in div
; Clobb: ecx, edx, flags (ebx, esi, edi are saved and restored)
; Note:  the counters live in the globals SnS/SuS, so the routine is
;        not reentrant. Repeated n-grams are counted once per matching
;        pair, e.g. "aa" vs "aa" with n = 1 yields SuS = 0 and result 0.
;-----------------------------------------------------------------------
n_gram:
        push    ebp                             ; standard frame
        mov     ebp, esp
        push    ebx                             ; callee-saved registers
        push    esi
        push    edi

        mov     dword [SnS], 0                  ; reset both counters
        mov     dword [SuS], 0

        xor     eax, eax                        ; result for the early exit
        cmp     dword [ebp + NG_ARG_N], 0
        jle     .done                           ; n <= 0: no valid window length

        mov     ecx, [ebp + NG_ARG_SIZE1]       ; ecx = bytes left in str_1
        mov     esi, [ebp + NG_ARG_STR1]        ; esi = current window in str_1
.outer:
        cmp     ecx, [ebp + NG_ARG_N]
        jl      .finish                         ; fewer than n bytes left
        inc     dword [SuS]                     ; count this str_1 window
        mov     edi, [ebp + NG_ARG_STR2]        ; edi = current window in str_2
        mov     edx, [ebp + NG_ARG_SIZE2]       ; edx = bytes left in str_2
.inner:
        cmp     edx, [ebp + NG_ARG_N]
        jl      .inner_done                     ; fewer than n bytes left
        cmp     esi, [ebp + NG_ARG_STR1]
        jne     .match                          ; str_2 windows are counted once,
        inc     dword [SuS]                     ; during the first outer pass only
.match:
        xor     ebx, ebx                        ; ebx = byte index inside window
.cmp_char:
        mov     al, [esi + ebx]
        cmp     al, [edi + ebx]
        jne     .advance                        ; windows differ
        inc     ebx
        cmp     ebx, [ebp + NG_ARG_N]
        jl      .cmp_char                       ; more bytes to compare
        inc     dword [SnS]                     ; all n bytes equal: a match
        dec     dword [SuS]                     ; matched pair was counted twice
.advance:
        inc     edi                             ; slide str_2 window by one byte
        dec     edx
        jmp     .inner
.inner_done:
        inc     esi                             ; slide str_1 window by one byte
        dec     ecx
        jmp     .outer

.finish:
        xor     eax, eax                        ; al was used as scratch above
        cmp     dword [SuS], 0
        jle     .done                           ; avoid dividing by zero (#DE)
        mov     eax, NG_PERCENT
        mul     dword [SnS]                     ; edx:eax = 100 * SnS
        xor     edx, edx                        ; only the low 32 bits are divided
        div     dword [SuS]                     ; eax = 100 * SnS / SuS
.done:
        pop     edi                             ; restore callee-saved registers
        pop     esi
        pop     ebx
        pop     ebp
        ret