Scripts/Fast File Read
< Scripts
Jump to navigation
Jump to search
Fast file Reading
Fast extraction of a relatively small amount of data from a large file is easy with J, using mapped files.
For example, the distinct IP addresses in a log file can be extracted as follows:
J version: Extract distinct IP addresses following a label from a large file
NB. findInFile: distinct values that follow a label in a (large) file.
NB.   x: label preceding the data, e.g. ' rhost='
NB.   y: the file name
NB. Each value is taken to end at the first blank; at most 16 characters
NB. after the label are examined. The result is a character matrix with
NB. one distinct value per row (shorter rows are padded with blanks).
NB. Uses map_jmf_ / unmap_jmf_, so the jmf script must be loaded.
findInFile=: 4 : 0
JCHAR map_jmf_ 'file';y     NB. mapped files really speed things up
nos=. x I.@:E. file         NB. positions of the label in the file
idx=. nos +/ (#x) + i.16    NB. one row of 16 character indices per label
inb=. idx < #file           NB. 0 where a row would run past the end of the file
NB. Read only in-range characters; out-of-range cells become blanks, so a
NB. label within 16 characters of the end of the file no longer raises an
NB. index error and its value still ends at a blank.
ip=. inb} (($idx)$' ') ,: (idx <. <:#file) { file
unmap_jmf_ 'file'
NB. keep each row up to its first blank, then return the unique rows
~. ({."0 1~ (i."1 &' ')) ip
)
On a small Acer Aspire One, J took about 0.65 seconds to extract 16000 IP addresses, and the 10 distinct IP addresses among them, from a 38 MB file.
The same program could be written in C, saving perhaps 0.30 seconds, but at the cost of somewhat more effort.
NB. test: write a large sample file, run findInFile on it, then erase it.
NB. The argument y is not used.
NB. NOTE(review): the last sentence executed is ferase, so that is what the
NB. verb returns; the findInFile result is computed but not returned. This
NB. looks intended for timing runs - confirm.
test=: 3 : 0
file=:'testfile.2'
NB. Build the sample text, reading the expression from right to left:
NB.   >:?40 1$255      40 random integers in 1..255
NB.   '.',"1~":        format them and append '.' to each
NB.   20000 16$,       ravel, then reshape cyclically into 20000 rows of 16
NB.   }:"1             drop the last character of each row
NB.   ' '-.~"1         remove the blanks from each row
NB.   ' rhost=',"1     prefix every row with the label
NB.   (20000 2000$' '),.~   append 2000 blanks to every row
NB.   ,                ravel everything into one long character list
out=.,(20000 2000$' '),.~' rhost=',"1(' '-.~"1(}:"1 (20000 16$,'.',"1~":>:?40 1$255)))
out fwrite file
label =.' rhost='
label findInFile file
ferase file
)
C version: Extract distinct IP addresses following a label from a large file
//////////////////////////////////////////////////////
//                                                  //
// Small C mmap() sample.                           //
// Written by Martin Cyr.                           //
// Feel free to change and distribute, but credit   //
// is always nice. If you use, I'd be pleased to    //
// hear from you at Spooles at GMail dot com.       //
//                                                  //
//////////////////////////////////////////////////////

#include <fcntl.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#define INITIAL_CAPACITY 25
#define IP_CHAR_LEN 16
#define LINE_CHAR_LEN 1024
#define PATTERN " rhost="

void showUsage(char*);
int processFile(char*, char***);
int addNextHost(char***, int, int*, char*, int);
int countMatches(char*, char**, int);
void printDestroyArray(char**, int);

static int isHostEnd(char c);
static int addHostBounded(char ***hosts, int hostNum, int *hostMax,
                          const char *map, size_t offset, size_t mapLen);

/*
 * Entry point: expects exactly one argument, the file to scan.
 * Prints every distinct host found after PATTERN, one per line.
 */
int main(int argc, char** argv)
{
    char **hosts = NULL;
    int hostCount;

    if (argc != 2) {
        /* argv[0] may be NULL when argc is 0; never hand NULL to %s. */
        showUsage((argc > 0 && argv[0] != NULL) ? argv[0] : "fastread");
        return EXIT_SUCCESS;
    }

    hostCount = processFile(argv[1], &hosts);
    printDestroyArray(hosts, hostCount);
    free(hosts);
    return EXIT_SUCCESS;
}

/* Print a short usage message; filename is the program name to display. */
void showUsage(char* filename)
{
    printf("Usage: %s <filename>\n", filename);
    printf("\tParses the <filename> for occurences of rhost= \n");
    printf("\tand sends everything to stdout\n");
}

/*
 * Count how many of the first `count` strings in `array` equal `match`.
 * Returns 0 when the string is not present.
 */
int countMatches(char* match, char** array, int count)
{
    int i, ret = 0;

    for (i = 0; i < count; i++) {
        if (strcmp(match, array[i]) == 0)
            ret++;
    }
    return ret;
}

/*
 * Print each of the first `count` strings on its own line and free it.
 * The array itself is NOT freed; that remains the caller's job.
 */
void printDestroyArray(char** array, int count)
{
    int i;

    for (i = 0; i < count; i++) {
        printf("%s\n", array[i]);
        free(array[i]);
    }
}

/*
 * Map `filename` read-only and collect every distinct token that follows
 * PATTERN.
 *
 * On return *hosts points to a malloc'd array of malloc'd strings; the
 * caller owns both. Returns the number of strings stored. Any failure
 * (open, fstat, mmap, allocation) prints a message and exits.
 */
int processFile(char* filename, char*** hosts)
{
    const size_t patternLen = strlen(PATTERN);
    int hostCount = 0;
    int hostMax = INITIAL_CAPACITY;
    struct stat info;
    size_t size;
    size_t pos = 0;
    char *map;
    int fd;

    /* The table holds pointers, so size it by the pointer type. */
    *hosts = malloc((size_t)hostMax * sizeof **hosts);
    if (*hosts == NULL) {
        perror("Error allocating the host table");
        exit(EXIT_FAILURE);
    }

    fd = open(filename, O_RDONLY);
    if (fd == -1) {
        perror("Error opening file");
        exit(EXIT_FAILURE);
    }

    /* fstat the descriptor we actually opened rather than the path again. */
    if (fstat(fd, &info) != 0) {
        perror("Unable to get file stats");
        exit(EXIT_FAILURE);
    }

    /* mmap() rejects a zero length; an empty file simply has no hosts. */
    if (info.st_size <= 0) {
        close(fd);
        return 0;
    }
    if ((uintmax_t)info.st_size > (uintmax_t)SIZE_MAX) {
        fprintf(stderr, "File is too large to map\n");
        exit(EXIT_FAILURE);
    }
    size = (size_t)info.st_size;

    map = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (map == MAP_FAILED) {
        perror("Error mapping the file");
        exit(EXIT_FAILURE);
    }

    /*
     * Look for the first pattern byte, then confirm the whole pattern.
     * Every candidate position is tested, so input such as "  rhost="
     * (a partial match immediately followed by a real one) is found.
     */
    while (patternLen > 0 && size - pos >= patternLen) {
        const char *hit = memchr(map + pos, PATTERN[0],
                                 size - pos - patternLen + 1);
        if (hit == NULL)
            break;

        pos = (size_t)(hit - map);
        if (memcmp(hit, PATTERN, patternLen) == 0) {
            pos += patternLen;   /* the token starts right after the label */
            hostCount = addHostBounded(hosts, hostCount, &hostMax,
                                       map, pos, size);
        } else {
            pos++;
        }
    }

    if (munmap(map, size) == -1) {
        perror("Error unmapping the file");
    }
    close(fd);

    return hostCount;
}

/* True for the bytes that end a host token. */
static int isHostEnd(char c)
{
    return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\0';
}

/*
 * Read the token starting at map[offset], never touching map[mapLen] or
 * beyond, and append it to *hosts unless it is empty or already present.
 *
 * Tokens longer than IP_CHAR_LEN - 1 characters are truncated to that
 * length. The table is doubled when full. Returns the new host count.
 */
static int addHostBounded(char ***hosts, int hostNum, int *hostMax,
                          const char *map, size_t offset, size_t mapLen)
{
    char host[IP_CHAR_LEN];
    size_t avail = (offset < mapLen) ? mapLen - offset : 0;
    size_t len = 0;
    char *copy;

    if (avail > IP_CHAR_LEN - 1)
        avail = IP_CHAR_LEN - 1;   /* leave room for the terminator */

    while (len < avail && !isHostEnd(map[offset + len])) {
        host[len] = map[offset + len];
        len++;
    }
    host[len] = '\0';

    if (len == 0 || countMatches(host, *hosts, hostNum) != 0)
        return hostNum;

    /* Grow before writing slot hostNum (valid slots are 0..hostMax-1). */
    if (hostNum >= *hostMax) {
        int newMax = (*hostMax > 0) ? *hostMax : INITIAL_CAPACITY;
        char **grown;

        while (newMax <= hostNum) {
            if (newMax > INT_MAX / 2) {
                fprintf(stderr, "Too many distinct hosts\n");
                exit(EXIT_FAILURE);
            }
            newMax *= 2;
        }

        /* Keep the old pointer until realloc is known to have worked. */
        grown = realloc(*hosts, (size_t)newMax * sizeof *grown);
        if (grown == NULL) {
            perror("Error growing the host table");
            exit(EXIT_FAILURE);
        }
        *hosts = grown;
        *hostMax = newMax;
    }

    copy = calloc(IP_CHAR_LEN, sizeof *copy);
    if (copy == NULL) {
        perror("Error allocating a host entry");
        exit(EXIT_FAILURE);
    }
    memcpy(copy, host, len);   /* calloc already supplied the terminator */
    (*hosts)[hostNum] = copy;

    return hostNum + 1;
}

/*
 * Original entry point, kept for compatibility.
 *
 * The length of `map` is unknown here, so the caller must guarantee that
 * up to IP_CHAR_LEN - 1 bytes from map[offset] are readable or that a
 * terminator (blank, tab, CR, LF or NUL) comes first.
 * Returns the new host count.
 */
int addNextHost(char*** hosts, int hostNum, int* hostMax, char* map, int offset)
{
    if (offset < 0)
        return hostNum;
    return addHostBounded(hosts, hostNum, hostMax, map,
                          (size_t)offset, SIZE_MAX);
}