发信人: zhch (zhch), 信区: BBSDev
标 题: 一个粗略的BBS快速全站全文离线搜索程序
发信站: 南京大学小百合站 (Sun Jun 23 13:28:23 2002), 站内信件
这是一段很粗略的代码。算法和一般的 Web 全文搜索引擎差不多,以空间换时间:预先把每篇
文章中所有长度为 2-12 字节的子串(关键字)穷举一遍并写入索引,因此可以快速全文搜索
全站文章。缺点是占用空间太大,数据大小会膨胀100倍,添加数据的速度也很慢,算法还有待改进。
程序1: 添加数据.
#include "/home/bbs/bbssrc/include/bbs.h"

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
/*
 * One fixed-size record of /search/.DIR2.
 * add_dir() appends one such record per article; the record's index in
 * that file is the "pos" value stored next to every keyword.
 */
struct dir2 {
    char board[24];  /* board name, NUL-terminated by strsncpy() */
    char title[60];  /* article title taken from the board's .DIR entry */
    int  filetime;   /* atoi() of the article file name past its first two characters */
};
/* Defined further down in this file; declared here because main() uses it first. */
int add_dir(char *board);

/*
 * Entry point of the indexer: "add3 board".
 *
 * Changes into BBSHOME (add_dir() opens "boards/<board>/.DIR" relative to
 * the current directory) and indexes the board named by the first argument.
 *
 * Returns 0 after running add_dir(), 1 if BBSHOME cannot be entered.
 * A missing argument prints the usage line and exits with status 0, as before.
 */
int main(int n, char *cmd[]) {
    if (n < 2) {
        printf("usage: add3 board\n");
        exit(0);
    }
    /* Without this check a failed chdir() would make every later relative
     * path resolve against whatever directory the tool was started from. */
    if (chdir(BBSHOME) != 0) {
        perror(BBSHOME);
        return 1;
    }
    add_dir(cmd[1]);
    return 0;
}
/*
 * Bounded string copy that always NUL-terminates.
 *
 * Copies at most len-1 bytes of s2 into s, zero-fills the remainder of the
 * len-byte destination (strncpy semantics) and stores a terminator in
 * s[len-1].
 *
 * s    destination buffer of at least len bytes
 * s2   NUL-terminated source string
 * len  size of the destination in bytes; values <= 0 leave s untouched
 *
 * Returns the length of the string now stored in s (0 when len <= 0).
 */
int strsncpy(char *s, char *s2, int len) {
    if (len <= 0) {
        return 0;
    }
    /* Copy one byte less than the buffer holds so the terminator always
     * fits, including the len == 1 case, which must yield an empty string. */
    strncpy(s, s2, (size_t)len - 1);
    s[len - 1] = '\0';
    return (int)strlen(s);
}
/* Defined further down in this file; declared here because add_dir() uses it first. */
int aaa(int pos, char *file, int len);

/*
 * Index every article listed in "boards/<board>/.DIR".
 *
 * For each struct fileheader read from the board's .DIR file this appends
 * one struct dir2 record to /search/.DIR2 and then calls aaa() for every
 * keyword length from 2 to 12, passing the index of the new record.
 *
 * board  board name; the path is resolved relative to the current directory
 *
 * Returns 0 on success, -1 if a file cannot be opened or written or the
 * board name does not fit the path buffer. Failures are reported on stderr.
 */
int add_dir(char *board) {
    FILE *fp, *fp2;
    struct fileheader x;
    struct dir2 x2;
    char path[256];
    long end;
    int pos;
    int rc = 0;

    if (snprintf(path, sizeof path, "boards/%s/.DIR", board) >= (int)sizeof path) {
        fprintf(stderr, "add_dir: board name too long: %s\n", board);
        return -1;
    }
    fp = fopen(path, "r");
    if (!fp) {
        perror(path);
        return -1;
    }
    printf("add_dir: %s\n", board);

    fp2 = fopen("/search/.DIR2", "a");
    if (!fp2) {
        perror("/search/.DIR2");
        fclose(fp);
        return -1;
    }

    /* The initial position of an append stream is implementation-defined,
     * so seek to the end explicitly before deriving the next record index. */
    if (fseek(fp2, 0L, SEEK_END) != 0 || (end = ftell(fp2)) < 0) {
        perror("/search/.DIR2");
        fclose(fp);
        fclose(fp2);
        return -1;
    }
    pos = (int)(end / (long)sizeof(struct dir2));

    while (fread(&x, sizeof x, 1, fp) == 1) {
        char file[256];
        int len;

        /* Zero the whole record first so unused bytes on disk are deterministic. */
        memset(&x2, 0, sizeof x2);
        strsncpy(x2.board, board, (int)sizeof x2.board);
        strsncpy(x2.title, x.title, (int)sizeof x2.title);
        x2.filetime = atoi(x.filename + 2);

        if (fwrite(&x2, sizeof x2, 1, fp2) != 1) {
            perror("/search/.DIR2");
            rc = -1;
            break;
        }

        if (snprintf(file, sizeof file, "boards/%s/%s", board, x.filename)
                < (int)sizeof file) {
            for (len = 2; len <= 12; len++) {
                aaa(pos, file, len);
            }
        } else {
            fprintf(stderr, "add_dir: article path too long, not indexed\n");
        }
        /* The record was appended, so the next article gets the next index. */
        pos++;
    }

    fclose(fp);
    if (fclose(fp2) != 0) {
        perror("/search/.DIR2");
        rc = -1;
    }
    return rc;
}
/*
 * Map a NUL-terminated keyword to a bucket number in the range 0..851.
 *
 * Every byte is passed through toupper() before being folded in, so
 * keywords that differ only in letter case get the same bucket.
 * The running value is unsigned and therefore wraps on overflow.
 */
int hash(unsigned char *s) {
    unsigned int acc = 0;
    const unsigned char *p;

    for (p = s; *p != '\0'; p++) {
        acc = acc * 79 + toupper(*p);
    }
    return acc % 852;
}
enum {
    AAA_BUCKETS = 852, /* must equal the modulus used by hash() */
    AAA_KEY_MAX = 19   /* longest keyword the fixed key buffers can hold */
};

/* One keyword already written to the index for the article being processed. */
struct aaa_seen {
    struct aaa_seen *next;
    char key[AAA_KEY_MAX + 1];
};

/* Release every chain of the per-bucket "already written" table. */
static void aaa_free_seen(struct aaa_seen **table, int buckets) {
    int b;

    for (b = 0; b < buckets; b++) {
        while (table[b] != NULL) {
            struct aaa_seen *dead = table[b];
            table[b] = dead->next;
            free(dead);
        }
    }
}

/* Return 1 if the len-byte keyword is already in chain (case-insensitive), else 0. */
static int aaa_is_seen(const struct aaa_seen *chain, const char *key, int len) {
    for (; chain != NULL; chain = chain->next) {
        if (strncasecmp(chain->key, key, (size_t)len) == 0) {
            return 1;
        }
    }
    return 0;
}

/*
 * Index all len-byte substrings of one article.
 *
 * Reads the article line by line. Each line is cut at its first CR/LF,
 * other control characters are replaced by spaces, and lines starting
 * with ": " are skipped. For every distinct len-byte substring a record
 * (pos as a raw int followed by the len keyword bytes) is appended to
 * "/search/<len>/<hash>".
 *
 * Distinctness is decided on the keyword itself, ignoring letter case.
 * Deciding it on the hash bucket alone would drop every further keyword
 * that shares a bucket with an earlier one, and a later search for such
 * a keyword would not find this article.
 *
 * pos   index of the article's record in /search/.DIR2
 * file  path of the article file
 * len   keyword length in bytes, 1..AAA_KEY_MAX
 *
 * Returns 0 on success, -1 on an invalid length, an unreadable article,
 * an index file that cannot be written, or memory exhaustion. Failures
 * are reported on stderr.
 */
int aaa(int pos, char *file, int len) {
    struct aaa_seen *seen[AAA_BUCKETS] = {0};
    char buf[256];
    FILE *fp;
    int rc = 0;

    if (len < 1 || len > AAA_KEY_MAX) {
        fprintf(stderr, "aaa: unsupported keyword length %d\n", len);
        return -1;
    }
    fp = fopen(file, "r");
    if (!fp) {
        perror(file);
        return -1;
    }
    printf("aa %d %s %d\n", pos, file, len);

    while (rc == 0 && fgets(buf, sizeof buf, fp) != NULL) {
        int linelen;
        int i;

        /* Cut the line at the first CR/LF and blank out other control bytes. */
        for (i = 0; buf[i]; i++) {
            if (buf[i] == '\n' || buf[i] == '\r') {
                buf[i] = '\0';
                break;
            }
            if (buf[i] > 0 && buf[i] < 32) {
                buf[i] = ' ';
            }
        }
        linelen = i;

        /* Lines starting with ": " are not indexed. */
        if (buf[0] == ':' && buf[1] == ' ') {
            continue;
        }

        /* Slide a len-byte window over the line; stop when it no longer fits. */
        for (i = 0; i + len <= linelen; i++) {
            char key[AAA_KEY_MAX + 1] = {0};
            char path2[256];
            struct aaa_seen *node;
            FILE *fp2;
            int h, ok;

            memcpy(key, buf + i, (size_t)len);
            h = hash((unsigned char *)key);
            if (h < 0 || h >= AAA_BUCKETS) {
                fprintf(stderr, "aaa: hash value %d out of range\n", h);
                rc = -1;
                break;
            }
            if (aaa_is_seen(seen[h], key, len)) {
                continue;
            }

            node = malloc(sizeof *node);
            if (node == NULL) {
                perror("aaa: malloc");
                rc = -1;
                break;
            }
            memcpy(node->key, key, sizeof node->key);
            node->next = seen[h];
            seen[h] = node;

            snprintf(path2, sizeof path2, "/search/%d/%d", len, h);
            fp2 = fopen(path2, "a");
            if (!fp2) {
                perror(path2);
                rc = -1;
                break;
            }
            ok = fwrite(&pos, sizeof pos, 1, fp2) == 1
                && fwrite(key, 1, (size_t)len, fp2) == (size_t)len;
            if (fclose(fp2) != 0) {
                ok = 0;
            }
            if (!ok) {
                perror(path2);
                rc = -1;
                break;
            }
        }
    }

    aaa_free_seen(seen, AAA_BUCKETS);
    fclose(fp);
    return rc;
}
程序2: 查找数据.
#include "/home/bbs/bbssrc/include/bbs.h"
/*
 * One fixed-size record of /search/.DIR2, as read back by show().
 * The record index is the "pos" value stored in the keyword files.
 */
struct dir2 {
    char board[24];  /* board name, printed as the "board=" URL parameter */
    char title[60];  /* article title (not printed by show()) */
    int  filetime;   /* number printed inside "file=M.<filetime>.A" */
};
/* Defined further down in this file; declared here because main() uses it first. */
int find3(char *s);

/*
 * Entry point of the searcher: "find3 xxx".
 *
 * Accepts exactly the keyword lengths the indexer produces (2..12 bytes)
 * and hands the keyword to find3().
 *
 * Returns 0. A missing or wrongly sized keyword prints a message and
 * exits with status 0, as before.
 */
int main(int n, char *cmd[]) {
    size_t len;

    if (n <= 1) {
        printf("usage: find3 xxx\n");
        exit(0);
    }
    len = strlen(cmd[1]);
    if (len < 2 || len > 12) {
        printf("len must > 1 and <13\n");
        exit(0);
    }
    find3(cmd[1]);
    return 0;
}
/*
 * Compute the bucket number (0..851) of a NUL-terminated keyword.
 *
 * Bytes are upper-cased with toupper() before being mixed in, which makes
 * the result independent of letter case. The accumulator is unsigned, so
 * overflow wraps around.
 */
int hash(unsigned char *s) {
    unsigned int sum = 0;
    int idx = 0;

    while (s[idx] != 0) {
        int upper = toupper(s[idx]);
        sum = sum * 79 + upper;
        idx++;
    }
    return sum % 852;
}
/* Defined further down in this file; declared here because find3() uses it first. */
int show(int pos);

/*
 * Look up keyword s in the index and print every article that contains it.
 *
 * Opens "/search/<len>/<hash>" and scans its records (a raw int position
 * followed by len keyword bytes). Each record whose keyword equals s,
 * ignoring letter case, is reported through show(). Consecutive records
 * with the same position are reported once.
 *
 * Prints "not found!" when the bucket file does not exist or holds no
 * matching record.
 *
 * Returns the number of articles reported (0 if none).
 */
int find3(char *s) {
    FILE *fp;
    char file[256];
    char tmp[80];
    int pos, last = -1;
    int hits = 0;
    int len = (int)strlen(s);
    int h;

    /* tmp receives len raw bytes per record; refuse keywords that do not fit. */
    if (len < 0 || len >= (int)sizeof tmp) {
        fprintf(stderr, "find3: keyword too long\n");
        return 0;
    }

    h = hash((unsigned char *)s);
    snprintf(file, sizeof file, "/search/%d/%d", len, h);
    printf("len=%d\n", len);
    printf("hash=%d\n", h);
    printf("s=%s\n", s);

    fp = fopen(file, "r");
    if (!fp) {
        printf("not found!\n");
        return 0;
    }

    while (fread(&pos, sizeof pos, 1, fp) == 1
           && fread(tmp, (size_t)len, 1, fp) == 1) {
        if (strncasecmp(s, tmp, (size_t)len) != 0) {
            continue;
        }
        /* Do not list the same article twice in a row. */
        if (pos != last) {
            show(pos);
            hits++;
        }
        last = pos;
    }
    fclose(fp);

    if (hits == 0) {
        printf("not found!\n");
    }
    return hits;
}
/*
 * Print the URL of the article stored as record number pos in /search/.DIR2.
 *
 * pos  zero-based record index; the printed "file=" counter is pos + 1
 *
 * Returns 0 when the record was read and printed, -1 when pos is negative
 * or the record cannot be read. Failures are reported on stderr.
 */
int show(int pos) {
    FILE *fp;
    struct dir2 x;
    int rc = -1;

    if (pos < 0) {
        fprintf(stderr, "show: invalid record number %d\n", pos);
        return -1;
    }
    fp = fopen("/search/.DIR2", "r");
    if (!fp) {
        perror("/search/.DIR2");
        return -1;
    }

    if (fseek(fp, (long)pos * (long)sizeof x, SEEK_SET) == 0
            && fread(&x, sizeof x, 1, fp) == 1) {
        /* The record comes from disk; make sure %s cannot run past the field. */
        x.board[sizeof x.board - 1] = '\0';
        /* The format string must be a single literal: "M.<time>.A" is one file name. */
        printf("file=%d, url=http://bbs.nju.edu.cn/bbscon?board=%s&file=M.%d.A\n",
               pos + 1, x.board, x.filetime);
        rc = 0;
    } else {
        fprintf(stderr, "show: cannot read record %d from /search/.DIR2\n", pos);
    }

    fclose(fp);
    return rc;
}
--
FROM 166.111.176.221