import { Bitset } from "./bitset";

// UTF-16 code unit of "a"; letter offsets are measured relative to it.
const A_VALUE = "a".charCodeAt(0);
// Size of the normalized alphabet: the 26 letters plus a single slot that
// stands in for every non-letter character.
const N_MONOGRAMS = 27;
// Matches each character that is not an ASCII letter (digits, whitespace,
// punctuation and non-ASCII characters all match).
const NON_WORD = /[^A-Za-z]/g;

/**
 * Lower-cases `input` and collapses every character matched by `NON_WORD`
 * into the single placeholder "{".
 */
const normalize = (input: string) => {
  const lowered = input.toLowerCase();
  // "{" is the ASCII character immediately after "z", so subtracting the
  // code of "a" from it yields 26, the slot right after the letters.
  return lowered.replace(NON_WORD, "{");
};

/** Returns all the digrams of a string as an array of digram indices
 *
 * The input is normalized first: letters are lower-cased and every
 * non-letter character (anything outside A-Z / a-z, including digits) is
 * treated as one and the same character. A digram index is the position of
 * the digram when all possible digrams are sorted alphabetically, with the
 * non-letter character ordered after "z". So index 0 represents "aa" and
 * index 728 represents the repeated non-letter character.
 *
 * Inputs shorter than two characters are padded with spaces so that at
 * least one digram is always produced: `digrams("a")` is `[26]` and
 * `digrams("")` is `[728]`.
 *
 * @param input - The text to split into overlapping character pairs.
 * @returns One index in the range 0..728 per adjacent pair of normalized
 *   characters, in order of appearance.
 */
export const digrams = (input: string): number[] => {
  const padded = input.length < 2 ? input.padEnd(2, " ") : input;
  const normalized = normalize(padded);
  // Bound the loop by the normalized text, not the raw input: padding (and
  // lower-casing of characters such as "İ") can make the normalized string
  // longer than the input, and iterating over the input length would leave
  // unfilled holes in the result.
  const count = normalized.length - 1;
  const out: number[] = new Array<number>(count);
  for (let ix = 0; ix < count; ix++) {
    const a = normalized.charCodeAt(ix) - A_VALUE;
    const b = normalized.charCodeAt(ix + 1) - A_VALUE;
    out[ix] = a * N_MONOGRAMS + b;
  }
  return out;
};

/**
 * Creates a `Bitset` with `maxSize` of `N_MONOGRAMS ** 2` (one slot per
 * possible digram index) and passes the digram indices of the given term(s)
 * to its `setAll` method.
 *
 * @param terms - A single string, or an array of strings whose digram
 *   indices are concatenated.
 * @returns The populated bitset.
 */
export const DigramBitset = (terms: string[] | string) => {
  const result = new Bitset({ maxSize: N_MONOGRAMS ** 2 });

  let indices: number[];
  if (typeof terms === "string") {
    indices = digrams(terms);
  } else {
    indices = terms.flatMap((term) => digrams(term));
  }

  result.setAll(indices);
  return result;
};
