*** src/backend/commands/analyze.c
--- /tmp/.diff_IHT3Qe	2008-05-09 19:38:06.000000000 +0200
***************
*** 1319,1330 ****
  			{
  				ArrayType  *arry;
  
! 				arry = construct_array(stats->stavalues[k],
! 									   stats->numvalues[k],
! 									   stats->attr->atttypid,
! 									   stats->attrtype->typlen,
! 									   stats->attrtype->typbyval,
! 									   stats->attrtype->typalign);
  				values[i++] = PointerGetDatum(arry);	/* stavaluesN */
  			}
  			else
--- 1319,1350 ----
  			{
  				ArrayType  *arry;
  
! 				/*
! 				 * XXX horrible hack - we're creating a pg_statistic tuple for
! 				 * a tsvector, but need to store an array of cstrings.
! 				 *
! 				 * Temporary measures...
! 				 */
! 				if (stats->stakind[0] == STATISTIC_KIND_MCL)
! 				{
! 					elog(NOTICE, "severly breaking stuff by brute force hackage");
! 					arry = construct_array(stats->stavalues[k],
! 										   stats->numvalues[k],
! 										   CSTRINGOID,
! 										   -2, /* typlen, -2 for cstring, per
! 												* comment from pg_type.h */
! 										   false,
! 										   'c');
! 				}
! 				else
! 				{
! 					arry = construct_array(stats->stavalues[k],
! 										   stats->numvalues[k],
! 										   stats->attr->atttypid,
! 										   stats->attrtype->typlen,
! 										   stats->attrtype->typbyval,
! 										   stats->attrtype->typalign);
! 				}
  				values[i++] = PointerGetDatum(arry);	/* stavaluesN */
  			}
  			else
*** src/backend/tsearch/Makefile
--- /tmp/.diff_wN6Neq	2008-05-09 19:38:06.000000000 +0200
***************
*** 19,25 ****
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
  	dict_simple.o dict_synonym.o dict_thesaurus.o \
  	dict_ispell.o regis.o spell.o \
! 	to_tsany.o ts_utils.o
  
  include $(top_srcdir)/src/backend/common.mk
  
--- 19,25 ----
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
  	dict_simple.o dict_synonym.o dict_thesaurus.o \
  	dict_ispell.o regis.o spell.o \
! 	to_tsany.o ts_utils.o ts_typanalyze.o
  
  include $(top_srcdir)/src/backend/common.mk
  
*** src/backend/tsearch/ts_typanalyze.c
--- /tmp/.diff_yh1vAu	2008-05-09 19:38:06.000000000 +0200
***************
*** 0 ****
--- 1,313 ----
+ /*-------------------------------------------------------------------------
+  *
+  * ts_typanalyze.c
+  *	  functions for gathering statistics from tsvector columns
+  *
+  *	  $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ 
+ #include "tsearch/ts_type.h"
+ #include "commands/vacuum.h"
+ 
+ static void compute_tsvector_stats(VacAttrStats *stats,
+ 								   AnalyzeAttrFetchFunc fetchfunc,
+ 								   int samplerows,
+ 								   double totalrows);
+ 
+ /* swapInt copied from analyze.c; swapString does the same for char pointers */
+ #define swapInt(a,b)		do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0)
+ #define swapString(a,b)		do {char *_tmp; _tmp=a; a=b; b=_tmp;} while(0)
+ 
+ /* XXX devel: D(x) expands to x only when DEBUG is defined, else to nothing */
+ #ifdef DEBUG
+ #define D(x) x
+ #else
+ #define D(x)
+ #endif
+ 
+ /*
+  *	ts_typanalyze -- the typanalyze function used for tsvector columns
+  */
+ Datum
+ ts_typanalyze(PG_FUNCTION_ARGS)
+ {
+ 	VacAttrStats *vacstats = (VacAttrStats *) PG_GETARG_POINTER(0);
+ 	Form_pg_attribute att = vacstats->attr;
+ 
+ 	/* A negative attstattarget means "fall back to the default target" */
+ 	/* NB: overwriting it in place is okay, vacstats->attr is only a copy */
+ 	if (att->attstattarget < 0)
+ 		att->attstattarget = default_statistics_target;
+ 
+ 	/* for the reasoning behind this minrows value see analyze.c */
+ 	vacstats->minrows = 300 * att->attstattarget;
+ 	vacstats->compute_stats = compute_tsvector_stats;
+ 
+ 	PG_RETURN_BOOL(true);
+ }
+ 
+ /*
+  *	compute_tsvector_stats() -- compute statistics for a tsvector column
+  *
+  *	Fetches each of the samplerows sampled values through fetchfunc and
+  *	fills in stats with the null fraction, the average width and, when
+  *	enough data is found, a STATISTIC_KIND_MCL slot.  totalrows is unused.
+  *
+  *	Instead of finding the most common values, as we do for most datatypes,
+  *	we're looking for the most common lexemes. This is more useful, because
+  *	there most probably won't be any two rows with the same tsvector and thus
+  *	the notion of a MCV is a bit bogus with this datatype. With a list of the
+  *	most common lexemes we can do a better job at figuring out @@ selectivity.
+  *
+  *	For the same reasons we assume that tsvector columns are unique when
+  *	determining the number of distinct values.
+  *
+  *	The algorithm to determine MCLs is the same as used by
+  *	compute_minimal_stats() to determine MCVs of a column.
+  */
+ static void compute_tsvector_stats(VacAttrStats *stats,
+ 								   AnalyzeAttrFetchFunc fetchfunc,
+ 								   int samplerows,
+ 								   double totalrows)
+ {
+ 	int			i;
+ 	int			null_cnt = 0;
+ 	double		total_width = 0;
+ 	typedef struct
+ 	{
+ 		char	*lexeme;
+ 		/*
+ 		 * Lexemes are stored in tsvectors as non-null-terminated strings.
+ 		 * Need to remember length.
+ 		 */
+ 		int		length;
+ 		int		count;
+ 	} TrackItem;
+ 	TrackItem	*track;
+ 	int			track_cnt,
+ 				track_max;
+ 	int			num_mcl = stats->attr->attstattarget;
+ 	double		mincount;
+ 
+ 	D(elog(NOTICE, "Going through %d samplerows", samplerows));
+ 
+ 	/*
+ 	 * We track up to 100*n lexemes for an n-element MCL list.
+ 	 * This needs to be a generous amount, because we go through all the
+ 	 * lexemes of a single document before advancing to another, and we should
+ 	 * have room to keep all lexemes from two consecutive documents on our
+ 	 * tracking list to mimic the behaviour of compute_minimal_stats().
+ 	 *
+ 	 * XXX be smarter about it, should really be "number of lexemes in longest
+ 	 * tsvector", not a blunt 100
+ 	 */
+ 	track_max = 100 * num_mcl;
+ 	track = (TrackItem *) palloc(track_max * sizeof(TrackItem));
+ 	track_cnt = 0;
+ 
+ 	for (i = 0; i < samplerows; i++)
+ 	{
+ 		Datum		value;
+ 		TSVector 	vector;
+ 		WordEntry	*curentryptr;
+ 		char		*lexemesptr;
+ 		bool		isnull;
+ 		int			j;
+ 
+ 		vacuum_delay_point();
+ 
+ 		D(elog(NOTICE, "Samplerow %d", i));
+ 
+ 		value = fetchfunc(stats, i, &isnull);
+ 
+ 		/*
+ 		 * Check for null/nonnull.
+ 		 *
+ 		 * We are going to analyze each row regardless of its width and
+ 		 * because of this we don't need a nonnull_cnt - we can use
+ 		 *  (samplerows - null_cnt)
+ 		 */
+ 		if (isnull)
+ 		{
+ 			D(elog(NOTICE, "It's null"));
+ 			null_cnt++;
+ 			continue;
+ 		}
+ 
+ 		/*
+ 		 * Add up the width of this value for the average-width statistic.
+ 		 *
+ 		 * The datum is measured before DatumGetTSVector() below has had a
+ 		 * chance to detoast it, so it may still carry a short or external
+ 		 * varlena header.  VARSIZE_4B is only valid for a plain 4-byte
+ 		 * header; VARSIZE_ANY copes with every header form.
+ 		 */
+ 		total_width += VARSIZE_ANY(DatumGetPointer(value));
+ 
+ 		/*
+ 		 * We loop through the lexemes in the tsvector and add them to our
+ 		 * tracking array. Only a pointer into the tsvector and a length are
+ 		 * kept here; null-terminated copies are made later, when building the
+ 		 * MCL array of cstrings for storing in the catalog.
+ 		 *
+ 		 * Nb. very common words like 'the', or 'a' should never make it into
+ 		 * the tsvector when using a dictionary with a proper stopwords list
+ 		 */
+ 		vector = DatumGetTSVector(value);
+ 		lexemesptr = STRPTR(vector);
+ 		curentryptr = ARRPTR(vector);
+ 		D(elog(NOTICE, "Going through all the lexemes"));
+ 
+ 		for (j = 0; j < vector->size; j++)
+ 		{
+ 			bool		match;
+ 			int			firstcount1;
+ 			int			k;
+ 
+ 			D(elog(NOTICE, "Lexeme '%.*s' examined",
+ 				   (int) curentryptr->len,
+ 				   lexemesptr + curentryptr->pos));
+ 
+ 			match = false;
+ 			firstcount1 = track_cnt;
+ 			for (k = 0; k < track_cnt; k++)
+ 			{
+ 				if (curentryptr->len == track[k].length &&
+ 					memcmp(lexemesptr + curentryptr->pos,
+ 						   track[k].lexeme,
+ 						   curentryptr->len) == 0)
+ 				{
+ 					D(elog(NOTICE, "Match found"));
+ 					match = true;
+ 					break;
+ 				}
+ 				if (k < firstcount1 && track[k].count == 1)
+ 					firstcount1 = k;
+ 			}
+ 
+ 			if (match)
+ 			{
+ 				/* Found a match */
+ 				track[k].count++;
+ 				/* This lexeme may now need to "bubble up" in the track list */
+ 				while (k > 0 && track[k].count > track[k - 1].count)
+ 				{
+ 					swapString(track[k].lexeme, track[k - 1].lexeme);
+ 					swapInt(track[k].length, track[k - 1].length);
+ 					swapInt(track[k].count, track[k - 1].count);
+ 					k--;
+ 				}
+ 			}
+ 			else
+ 			{
+ 				/* No match.  Insert at head of count-1 list */
+ 				if (track_cnt < track_max)
+ 					track_cnt++;
+ 				for (k = track_cnt - 1; k > firstcount1; k--)
+ 				{
+ 					track[k].lexeme = track[k - 1].lexeme;
+ 					track[k].length = track[k - 1].length;
+ 					track[k].count = track[k - 1].count;
+ 				}
+ 				if (firstcount1 < track_cnt)
+ 				{
+ 					track[firstcount1].lexeme = lexemesptr + curentryptr->pos;
+ 					track[firstcount1].length = curentryptr->len;
+ 					track[firstcount1].count = 1;
+ 				}
+ 			}
+ 			/* Advance to the next WordEntry in the tsvector */
+ 			curentryptr++;
+ 		}
+ 	}
+ 
+ 	/* print out found MCLs (debug builds only; lexemes are not terminated) */
+ 	for (i = 0; i < track_cnt; i++)
+ 	{
+ 		D(elog(NOTICE, "Lexeme '%.*s' has %d occurrencess",
+ 			   track[i].length,
+ 			   track[i].lexeme,
+ 			   track[i].count));
+ 	}
+ 
+ 	/* We can only compute real stats if we found some non-null values. */
+ 	if (null_cnt < samplerows)
+ 	{
+ 		stats->stats_valid = true;
+ 		/* Do the simple null-frac and width stats */
+ 		stats->stanullfrac = (double) null_cnt / (double) samplerows;
+ 		/* It's a tsvector, so it's of variable width - have to compute the average */
+ 		stats->stawidth = total_width / (double) (samplerows - null_cnt);
+ 
+ 		/* Assume it's a unique column */
+ 		stats->stadistinct = -1.0;
+ 
+ 
+ 		/*
+ 		 * Decide how many lexemes are worth storing as most-common lexemes. We
+ 		 * keep the lexemes that appear in more than one per mil of the sampled
+ 		 * documents, with a minimum of 2 occurrences. The counts were gathered
+ 		 * over the sample, so the threshold is based on samplerows as well.
+ 		 */
+ 		mincount = samplerows * 0.001;
+ 
+ 		if (mincount < 2)
+ 			mincount = 2;
+ 
+ 		if (num_mcl > track_cnt)
+ 			num_mcl = track_cnt;
+ 
+ 		for (i = 0; i < num_mcl; i++)
+ 		{
+ 			if (track[i].count < mincount)
+ 			{
+ 				num_mcl = i;
+ 				break;
+ 			}
+ 		}
+ 
+ 		/* Generate MCL slot entry */
+ 		if (num_mcl > 0)
+ 		{
+ 			MemoryContext	old_context;
+ 			char			*buf;
+ 			Datum			*mcl_values;
+ 			float4			*mcl_freqs;
+ 
+ 			/* Must copy the target values into anl_context */
+ 			old_context = MemoryContextSwitchTo(stats->anl_context);
+ 			mcl_values = (Datum *) palloc(num_mcl * sizeof(Datum));
+ 			mcl_freqs = (float4 *) palloc(num_mcl * sizeof(float4));
+ 			for (i = 0; i < num_mcl; i++)
+ 			{
+ 				buf = (char *) palloc(track[i].length + 1); /* + 1 for '\0' */
+ 				memcpy(buf, track[i].lexeme, track[i].length);
+ 				buf[track[i].length] = '\0';
+ 				D(elog(NOTICE, "Adding lexeme '%s' to the MCL array, it has %d occurrences", buf, track[i].count));
+ 				mcl_values[i] = CStringGetDatum(buf);
+ 				mcl_freqs[i] = (double) track[i].count / (double) samplerows;
+ 			}
+ 			MemoryContextSwitchTo(old_context);
+ 
+ 			stats->stakind[0] = STATISTIC_KIND_MCL;
+ 			stats->staop[0] = 0; /* nothing useful to put here */
+ 			stats->stanumbers[0] = mcl_freqs;
+ 			stats->numnumbers[0] = num_mcl;
+ 			stats->stavalues[0] = mcl_values;
+ 			stats->numvalues[0] = num_mcl;
+ 		}
+ 	}
+ 	else
+ 	{
+ 		/* We found only nulls; assume the column is entirely null */
+ 		stats->stats_valid = true;
+ 		stats->stanullfrac = 1.0;
+ 		stats->stawidth = 0;		/* "unknown" */
+ 		stats->stadistinct = 0.0;	/* "unknown" */
+ 	}
+ 
+ 	/* We don't need to bother cleaning up any of our temporary palloc's */
+ }
*** src/include/catalog/pg_proc.h
--- /tmp/.diff_6AZKAK	2008-05-09 19:38:06.000000000 +0200
***************
*** 4415,4420 ****
--- 4415,4422 ----
  DESCR("I/O");
  DATA(insert OID = 3774 (  regdictionarysend PGNSP PGUID 12 1 0 f f t f i 1 17 "3769" _null_ _null_ _null_ regdictionarysend - _null_ _null_ ));
  DESCR("I/O");
+ DATA(insert OID = 3775 (  ts_typanalyze		PGNSP PGUID 12 1 0 f f t f i 1 16 "2281" _null_ _null_ _null_ ts_typanalyze - _null_ _null_));
+ DESCR("tsvector typanalyze");
  
  /* txid */
  DATA(insert OID = 2939 (  txid_snapshot_in			PGNSP PGUID 12 1  0 f f t f i 1 2970 "2275" _null_ _null_ _null_ txid_snapshot_in - _null_ _null_ ));
*** src/include/catalog/pg_statistic.h
--- /tmp/.diff_cuLgzY	2008-05-09 19:38:06.000000000 +0200
***************
*** 237,240 ****
--- 237,253 ----
   */
  #define STATISTIC_KIND_CORRELATION	3
  
+ /*
+  * XXX fix wording, state clearly what are we counting here
+  *
+  * A "most common lexemes" slot is similar to a "most common values" slot.
+  * It contains information about a column with a type of "tsvector".  staop is
+  * zero, stavalues contains the K most common lexeme occurrences, where a
+  * "lexeme occurrence" happens when a lexeme appears (possibly more than once)
+  * in the tsvector and is represented in stavalues by that particular lexeme.
+  * stanumbers contains the fraction of total row count in which given lexemes
+  * appear. As with a MCV slot, K may be chosen by the statistics collector.
+  */
+ #define STATISTIC_KIND_MCL	4
+ 
  #endif   /* PG_STATISTIC_H */
*** src/include/catalog/pg_type.h
--- /tmp/.diff_41XxPj	2008-05-09 19:38:06.000000000 +0200
***************
*** 543,549 ****
  DATA(insert OID = 2951 ( _uuid			PGNSP PGUID -1 f b t \054 0 2950 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 _null_ _null_ ));
  
  /* text search */
! DATA(insert OID = 3614 ( tsvector		PGNSP PGUID -1 f b t \054 0 0 3643 tsvectorin tsvectorout tsvectorrecv tsvectorsend - - - i x f 0 -1 0 _null_ _null_ ));
  DESCR("text representation for text search");
  #define TSVECTOROID		3614
  DATA(insert OID = 3642 ( gtsvector		PGNSP PGUID -1 f b t \054 0 0 3644 gtsvectorin gtsvectorout - - - - - i p f 0 -1 0 _null_ _null_ ));
--- 543,549 ----
  DATA(insert OID = 2951 ( _uuid			PGNSP PGUID -1 f b t \054 0 2950 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 _null_ _null_ ));
  
  /* text search */
! DATA(insert OID = 3614 ( tsvector		PGNSP PGUID -1 f b t \054 0 0 3643 tsvectorin tsvectorout tsvectorrecv tsvectorsend - - ts_typanalyze i x f 0 -1 0 _null_ _null_ ));
  DESCR("text representation for text search");
  #define TSVECTOROID		3614
  DATA(insert OID = 3642 ( gtsvector		PGNSP PGUID -1 f b t \054 0 0 3644 gtsvectorin gtsvectorout - - - - - i p f 0 -1 0 _null_ _null_ ));
*** src/include/tsearch/ts_type.h
--- /tmp/.diff_2hwlqn	2008-05-09 19:38:07.000000000 +0200
***************
*** 153,158 ****
--- 153,159 ----
  extern Datum ts_rankcd_ttf(PG_FUNCTION_ARGS);
  extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS);
  
+ extern Datum ts_typanalyze(PG_FUNCTION_ARGS);
  
  /*
   * TSQuery