*** src/backend/commands/analyze.c --- /tmp/.diff_IHT3Qe 2008-05-09 19:38:06.000000000 +0200 *************** *** 1319,1330 **** { ArrayType *arry; ! arry = construct_array(stats->stavalues[k], ! stats->numvalues[k], ! stats->attr->atttypid, ! stats->attrtype->typlen, ! stats->attrtype->typbyval, ! stats->attrtype->typalign); values[i++] = PointerGetDatum(arry); /* stavaluesN */ } else --- 1319,1350 ---- { ArrayType *arry; ! /* ! * XXX horrible hack - we're creating a pg_statistic tuple for ! * a tsvector, but need to store an array of cstrings. ! * ! * Temporary measures... ! */ ! if (stats->stakind[0] == STATISTIC_KIND_MCL) ! { ! elog(NOTICE, "severly breaking stuff by brute force hackage"); ! arry = construct_array(stats->stavalues[k], ! stats->numvalues[k], ! CSTRINGOID, ! -2, /* typlen, -2 for cstring, per ! * comment from pg_type.h */ ! false, ! 'c'); ! } ! else ! { ! arry = construct_array(stats->stavalues[k], ! stats->numvalues[k], ! stats->attr->atttypid, ! stats->attrtype->typlen, ! stats->attrtype->typbyval, ! stats->attrtype->typalign); ! } values[i++] = PointerGetDatum(arry); /* stavaluesN */ } else *** src/backend/tsearch/Makefile --- /tmp/.diff_wN6Neq 2008-05-09 19:38:06.000000000 +0200 *************** *** 19,25 **** OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \ dict_simple.o dict_synonym.o dict_thesaurus.o \ dict_ispell.o regis.o spell.o \ ! to_tsany.o ts_utils.o include $(top_srcdir)/src/backend/common.mk --- 19,25 ---- OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \ dict_simple.o dict_synonym.o dict_thesaurus.o \ dict_ispell.o regis.o spell.o \ ! to_tsany.o ts_utils.o ts_typanalyze.o include $(top_srcdir)/src/backend/common.mk *** src/backend/tsearch/ts_typanalyze.c --- /tmp/.diff_yh1vAu 2008-05-09 19:38:06.000000000 +0200 *************** *** 0 **** --- 1,313 ---- + /*------------------------------------------------------------------------- + * + * ts_typanalyze.c + * functions for gathering statistics from tsvector columns + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + #include "postgres.h" + + #include "tsearch/ts_type.h" + #include "commands/vacuum.h" + + static void compute_tsvector_stats(VacAttrStats *stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows); + + /* swapInt copied from analyze.c */ + #define swapInt(a,b) do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0) + #define swapString(a,b) do {char *_tmp; _tmp=a; a=b; b=_tmp;} while(0) + + /* XXX devel */ + #ifdef DEBUG + #define D(x) x + #else + #define D(x) + #endif + + /* + * ts_typanalyze -- a custom typanalyze function for tsvector columns + */ + Datum + ts_typanalyze(PG_FUNCTION_ARGS) + { + VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0); + Form_pg_attribute attr = stats->attr; + + /* If the attstattarget column is negative, use the default value */ + /* NB: it is okay to scribble on stats->attr since it's a copy */ + if (attr->attstattarget < 0) + attr->attstattarget = default_statistics_target; + + stats->compute_stats = compute_tsvector_stats; + /* see comment about the choice of minrows from analyze.c */ + stats->minrows = 300 * attr->attstattarget; + + PG_RETURN_BOOL(true); + } + + /* + * compute_tsvector_stats() -- compute statistics for a tsvector column + * + * This functions computes statistics that are useful for determining @@ + * operations selectivity, along with the fraction of non-null rows and + * average width. + * + * Instead of finding the most common values, as we do for most datatypes, + * we're looking for the most common lexemes. This is more useful, because + * there most probably won't be any two rows with the same tsvector and thus + * the notion of a MCV is a bit bogus with this datatype. With a list of the + * most common lexemes we can do a better job at figuring out @@ selectivity. + * + * For the same reasons we assume that tsvector columns are unique when + * determining the number of distinct values. + * + * The algorithm to determine MCLs is the same as used by + * compute_minimal_stats() to determine MCVs of a column + */ + static void compute_tsvector_stats(VacAttrStats *stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows) + { + int i; + int null_cnt = 0; + double total_width = 0; + typedef struct + { + char *lexeme; + /* + * Lexemes are stored in tsvectors as non-null-terminated strings. + * Need to remember length. + */ + int length; + int count; + } TrackItem; + TrackItem *track; + int track_cnt, + track_max; + int num_mcl = stats->attr->attstattarget; + double mincount; + + D(elog(NOTICE, "Going through %d samplerows", samplerows)); + + /* + * We track up to 100*n lexemes for an n-element MCL list + * This needs to be a generous amount, because we go through all the + * lexemes of a single document before advancing to another, and we should + * have room to keep all lexemes from two consecutive documents on our + * tracking list to mimick the behaviour of compute_miminal_stats() + * + * XXX be smarter about it, should really be "number of lexemes in longest + * tsvector", not a blunt 100 + */ + track_max = 100 * num_mcl; + track = (TrackItem *) palloc(track_max * sizeof(TrackItem)); + track_cnt = 0; + + for (i = 0; i < samplerows; i++) + { + Datum value; + TSVector vector; + WordEntry *curentryptr; + char *lexemesptr; + bool isnull; + int j; + + vacuum_delay_point(); + + D(elog(NOTICE, "Samplerow %d", i)); + + value = fetchfunc(stats, i, &isnull); + + /* + * Check for null/nonnull. + * + * We are going do analyze each row regardless of its width and + * because of this we don't need a nonnull_cnt - we can use + * (samplerows - null_cnt) + */ + if (isnull) + { + D(elog(NOTICE, "It's null")); + null_cnt++; + continue; + } + + /* + * Since it's a tsvector we have, we know it's varlena we need to use + * VARSIZE to get the width + * + * XXX following tsvector_op.c, that uses VARSIZE on tsvectors (and not + * VARSIZE_ANY) I use VARSIZE_4B (because we explicitly know what + * datatype we are dealing with and it feels cleaner). Is this ok? + */ + total_width += VARSIZE_4B(DatumGetPointer(value)); + + /* + * We loop through the lexemes in the tsvector and add them to our + * tracking array. Add them as null-terminated strings, as we'll be + * using them for generating a MCL array of cstrings for storing in the + * catalog. + * + * Nb. very common words like 'the', or 'a' should never make it into + * the tsvector when using a dictionary with a proper stopwords list + */ + vector = DatumGetTSVector(value); + lexemesptr = STRPTR(vector); + curentryptr = ARRPTR(vector); + D(elog(NOTICE, "Going through all the lexemes")); + + for (j = 0; j < vector->size; j++) + { + bool match; + int firstcount1; + int k; + + D(elog(NOTICE, "Lexeme '%*s' examined", + curentryptr->len, + lexemesptr + curentryptr->pos)); + + match = false; + firstcount1 = track_cnt; + for (k = 0; k < track_cnt; k++) + { + if (curentryptr->len == track[k].length && + strncmp(lexemesptr + curentryptr->pos, + track[k].lexeme, + curentryptr->len) == 0) + { + D(elog(NOTICE, "Match found")); + match = true; + break; + } + if (k < firstcount1 && track[k].count == 1) + firstcount1 = k; + } + + if (match) + { + /* Found a match */ + track[k].count++; + /* This lexeme may now need to "bubble up" in the track list */ + while (k > 0 && track[k].count > track[k - 1].count) + { + swapString(track[k].lexeme, track[k - 1].lexeme); + swapInt(track[k].length, track[k - 1].length); + swapInt(track[k].count, track[k - 1].count); + k--; + } + } + else + { + /* No match. Insert at head of count-1 list */ + if (track_cnt < track_max) + track_cnt++; + for (k = track_cnt - 1; k > firstcount1; k--) + { + track[k].lexeme = track[k - 1].lexeme; + track[k].length = track[k - 1].length; + track[k].count = track[k - 1].count; + } + if (firstcount1 < track_cnt) + { + track[firstcount1].lexeme = lexemesptr + curentryptr->pos; + track[firstcount1].length = curentryptr->len; + track[firstcount1].count = 1; + } + } + /* Advance to the next WordEntry in the tsvector */ + curentryptr++; + } + } + + /* print out found MCLs */ + for (i = 0; i < track_cnt; i++) + { + D(elog(NOTICE, "Lexeme '%*s' has %d occurrencess", + track[i].length, + track[i].lexeme, + track[i].count)); + } + + /* We can only compute real stats if we found some non-null values. */ + if (null_cnt < samplerows) + { + stats->stats_valid = true; + /* Do the simple null-frac and width stats */ + stats->stanullfrac = (double) null_cnt / (double) samplerows; + /* It's a tsvector, so it's of variable width - have to compute the average */ + stats->stawidth = total_width / (double) (samplerows - null_cnt); + + /* Assume it's a unique column */ + stats->stadistinct = -1.0; + + + /* + * Decide how many lexemes are worth storing as most-common lexemes. We + * keep the lexemes, that appear in more than one per mil of the + * documents, with a minimum of 2 occurrences. This is a bit arbitrary, of + * course. + */ + mincount = totalrows * 0.001; + + if (mincount < 2) + mincount = 2; + + if (num_mcl > track_cnt) + num_mcl = track_cnt; + + for (i = 0; i < num_mcl; i++) + { + if (track[i].count < mincount) + { + num_mcl = i; + break; + } + } + + /* Generate MCL slot entry */ + if (num_mcl > 0) + { + MemoryContext old_context; + char *buf; + Datum *mcl_values; + float4 *mcl_freqs; + + /* Must copy the target values into anl_context */ + old_context = MemoryContextSwitchTo(stats->anl_context); + mcl_values = (Datum *) palloc(num_mcl * sizeof(Datum)); + mcl_freqs = (float4 *) palloc(num_mcl * sizeof(float4)); + for (i = 0; i < num_mcl; i++) + { + buf = (char *) palloc(track[i].length + 1); /* + 1 for '\0' */ + memcpy(buf, track[i].lexeme, track[i].length); + buf[track[i].length] = '\0'; + elog(NOTICE, "Adding lexeme '%s' to the MCL array, it has %d occurrences", buf, track[i].count); + mcl_values[i] = CStringGetDatum(buf); + mcl_freqs[i] = (double) track[i].count / (double) samplerows; + } + MemoryContextSwitchTo(old_context); + + stats->stakind[0] = STATISTIC_KIND_MCL; + stats->staop[0] = 0; /* nothing useful to put here */ + stats->stanumbers[0] = mcl_freqs; + stats->numnumbers[0] = num_mcl; + stats->stavalues[0] = mcl_values; + stats->numvalues[0] = num_mcl; + } + } + else + { + /* We found only nulls; assume the column is entirely null */ + stats->stats_valid = true; + stats->stanullfrac = 1.0; + stats->stawidth = 0; /* "unknown" */ + stats->stadistinct = 0.0; /* "unknown" */ + } + + /* We don't need to bother cleaning up any of our temporary palloc's */ + } *** src/include/catalog/pg_proc.h --- /tmp/.diff_6AZKAK 2008-05-09 19:38:06.000000000 +0200 *************** *** 4415,4420 **** --- 4415,4422 ---- DESCR("I/O"); DATA(insert OID = 3774 ( regdictionarysend PGNSP PGUID 12 1 0 f f t f i 1 17 "3769" _null_ _null_ _null_ regdictionarysend - _null_ _null_ )); DESCR("I/O"); + DATA(insert OID = 3775 ( ts_typanalyze PGNSP PGUID 12 1 0 f f t f i 1 16 "2281" _null_ _null_ _null_ ts_typanalyze - _null_ _null_)); + DESCR("tsvector typanalyze"); /* txid */ DATA(insert OID = 2939 ( txid_snapshot_in PGNSP PGUID 12 1 0 f f t f i 1 2970 "2275" _null_ _null_ _null_ txid_snapshot_in - _null_ _null_ )); *** src/include/catalog/pg_statistic.h --- /tmp/.diff_cuLgzY 2008-05-09 19:38:06.000000000 +0200 *************** *** 237,240 **** --- 237,253 ---- */ #define STATISTIC_KIND_CORRELATION 3 + /* + * XXX fix wording, state clearly what are we counting here + * + * A "most common lexemes" slot is similar to a "most common values" slot. + * It contains information about a column with a type of "tsvector". staop is + * zero, stavalues contain the K most common lexeme occurences, where a "lexeme + * occurence" happens when a lexeme appears (possibly more than once) in the + * tsvector and is represented in stavalues by that particular lexeme. + * stanumbers contains the fraction of total row count in which given lexemes + * appear. As with a MCV slot, K may be chosen by the statistics collector. + */ + #define STATISTIC_KIND_MCL 4 + #endif /* PG_STATISTIC_H */ *** src/include/catalog/pg_type.h --- /tmp/.diff_41XxPj 2008-05-09 19:38:06.000000000 +0200 *************** *** 543,549 **** DATA(insert OID = 2951 ( _uuid PGNSP PGUID -1 f b t \054 0 2950 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 _null_ _null_ )); /* text search */ ! DATA(insert OID = 3614 ( tsvector PGNSP PGUID -1 f b t \054 0 0 3643 tsvectorin tsvectorout tsvectorrecv tsvectorsend - - - i x f 0 -1 0 _null_ _null_ )); DESCR("text representation for text search"); #define TSVECTOROID 3614 DATA(insert OID = 3642 ( gtsvector PGNSP PGUID -1 f b t \054 0 0 3644 gtsvectorin gtsvectorout - - - - - i p f 0 -1 0 _null_ _null_ )); --- 543,549 ---- DATA(insert OID = 2951 ( _uuid PGNSP PGUID -1 f b t \054 0 2950 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 _null_ _null_ )); /* text search */ ! DATA(insert OID = 3614 ( tsvector PGNSP PGUID -1 f b t \054 0 0 3643 tsvectorin tsvectorout tsvectorrecv tsvectorsend - - ts_typanalyze i x f 0 -1 0 _null_ _null_ )); DESCR("text representation for text search"); #define TSVECTOROID 3614 DATA(insert OID = 3642 ( gtsvector PGNSP PGUID -1 f b t \054 0 0 3644 gtsvectorin gtsvectorout - - - - - i p f 0 -1 0 _null_ _null_ )); *** src/include/tsearch/ts_type.h --- /tmp/.diff_2hwlqn 2008-05-09 19:38:07.000000000 +0200 *************** *** 153,158 **** --- 153,159 ---- extern Datum ts_rankcd_ttf(PG_FUNCTION_ARGS); extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS); + extern Datum ts_typanalyze(PG_FUNCTION_ARGS); /* * TSQuery