Table of Contents

Procedure to add a new code-page to john

This is for developers or other advanced users. So, you have a set of hashes, and they are in a code page for some language which john does not support. Well, john 'can' have code pages added to its support.

The code pages I know to work are 8-bit (256 characters) and use a left-to-right reading direction. I do not know if other reading directions can be added. John does already have UTF-8 support, but since UTF-8 is a variable-width encoding, there are many things within john which do not work with it. For one, some rules will not work well with UTF-8.

However, with a little work, a character set CAN be fully added to john. This help page was written while adding code page CP866 (DOS/LM Russian). This shows the proper way to make sure that all items are done properly, and all features of john work with the new code page.

Adding a new code page (prelim)

Hint to search for prior CodePage additions

Obtaining the data for the Code-Page to Unicode conversion

perl Unicode/cmpt_cp.pl -v cp866 

// here is the CP866 to Unicode conversion for CP866 characters from 0x80 to 0xFF
static UTF16 CP866_to_unicode_high128[] = {
0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E,0x041F,
0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,0x042E,0x042F,
0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x043E,0x043F,
0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,0x2555,0x2563,0x2551,0x2557,0x255D,0x255C,0x255B,0x2510,
0x2514,0x2534,0x252C,0x251C,0x2500,0x253C,0x255E,0x255F,0x255A,0x2554,0x2569,0x2566,0x2560,0x2550,0x256C,0x2567,
0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256B,0x256A,0x2518,0x250C,0x2588,0x2584,0x258C,0x2590,0x2580,
0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,0x044E,0x044F,
0x0401,0x0451,0x0404,0x0454,0x0407,0x0457,0x040E,0x045E,0x00B0,0x2219,0x00B7,0x221A,0x2116,0x00A4,0x25A0,0x00A0 };
#define CHARS_LOWER_CP866 \
	"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF1\xF3\xF5\xF7"
#define CHARS_LOW_ONLY_CP866
#define CHARS_UPPER_CP866 \
	"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xF0\xF2\xF4\xF6"
#define CHARS_UP_ONLY_CP866
#define CHARS_DIGITS_CP866
#define CHARS_PUNCTUATION_CP866 "\xFA"
#define CHARS_SPECIALS_CP866 \
	"\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xF8\xF9\xFB\xFC\xFD\xFE"
#define CHARS_ALPHA_CP866 \
	"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7"
#define CHARS_WHITESPACE_CP866 "\xFF"
#define CHARS_CONTROL_CP866
#define CHARS_INVALID_CP866 ""
#define CHARS_VOWELS_CP866 \
	"\x59\x79\x80\x85\x88\x89\x8E\x93\x9B\x9D\x9E\x9F\xA0\xA5\xA8\xA9\xAE\xE3\xEB\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7"
#define CHARS_CONSONANTS_CP866 \
	"\x81\x82\x83\x84\x86\x87\x8A\x8B\x8C\x8D\x8F\x90\x91\x92\x94\x95\x96\x97\x98\x99\x9A\x9C\xA1\xA2\xA3\xA4\xA6\xA7\xAA\xAB\xAC\xAD\xAF\xE0\xE1\xE2\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEC"

Linking the Unicode conversion into john

* Edit ./src/rules.c. Change this code:

// this will 'pacify' compiler warnings.
UTF16 *JunkStuff[] = {KOI8_r_to_unicode_high128, CP1251_to_unicode_high128};

* To this. We add our new variable to this 'junk' array; this simply pacifies the compiler warnings.

// this will 'pacify' compiler warnings.
UTF16 *JunkStuff[] = {KOI8_r_to_unicode_high128, CP1251_to_unicode_high128, CP866_to_unicode_high128};

* Now, within ./src/unicode.c, we are going to modify some code in the initUnicode() function.
* The 2 blocks of code we are going to modify are right under the comment: “Here we setup the 8-bit codepages we handle, and setup the mapping values into Unicode.”
* This shows the changed code for our CP866 data (the lines marked with '+' are the additions):

	// Here we setup the 8-bit codepages we handle, and setup the mapping values into Unicode.
	for (i = 0; i < 128; ++i) {
		CP_to_Unicode[i] = i;
	}
	for (i = 128; i < 256; ++i) {
		if (options.cp1251)
			CP_to_Unicode[i] = CP1251_to_unicode_high128[i-128];
		else if (options.koi8_r)
			CP_to_Unicode[i] = KOI8_r_to_unicode_high128[i-128];
+		else if (options.cp866)
+			CP_to_Unicode[i] = CP866_to_unicode_high128[i-128];
		else
			CP_to_Unicode[i] = i;
	}
	for (i = 0; i < 0x10000; ++i) 
		CP_from_Unicode[i] = i;  // will truncate to lower 8 bits.
	for (i = 0; i < 128; ++i) {
		if (options.cp1251)
			CP_from_Unicode[CP1251_to_unicode_high128[i]] = i+128;
		else if (options.koi8_r)
			CP_from_Unicode[KOI8_r_to_unicode_high128[i]] = i+128;
+		else if (options.cp866)
+			CP_from_Unicode[CP866_to_unicode_high128[i]] = i+128;
		else {
			// for iso-8859-1, this is the only change to 'straight' 0 to 0xFF -> 0 to 0xFF.
			CP_from_Unicode[0x39C] = 0xB5;
			break;
		}
	}

Adding the command line options and help into john

 /* wordfile character encoding 'stuff' */
	int encoding_7_bit;  // if NO other charset is used, we set this to 1.  This tells us to use 7 bit ASCII.
	int utf8;
	int iso8859_1;
	int koi8_r;
	int cp1251;
+	int cp866;
 };
-	options.utf8 = options.iso8859_1 = options.koi8_r = options.cp1251 = 0;
+	options.utf8 = options.iso8859_1 = options.koi8_r = options.cp1251 = options.cp866 = 0;
	// by 'default' we are setup in 7 bit ascii mode (for rules).
	options.encoding_7_bit = 1;
	if ( (options.flags & FLG_INP_ENCODING) && options.encoding) {
		// Ok, check a 'few' valid things for utf8
		options.encoding_7_bit = 0;
		if (!strcasecmp(options.encoding, "utf8")||!strcasecmp(options.encoding, "utf-8"))
			options.utf8 = 1;
		else if (!strcasecmp(options.encoding, "ansi")||!strcasecmp(options.encoding, "iso-8859-1")||!strcasecmp(options.encoding, "8859-1"))
			options.iso8859_1 = 1;
		else if (!strcasecmp(options.encoding, "koi8-r")||!strcasecmp(options.encoding, "koi8r"))
			options.koi8_r = 1;
		else if (!strcasecmp(options.encoding, "cp1251")||!strcasecmp(options.encoding, "cp-1251"))
			options.cp1251 = 1;
+		else if (!strcasecmp(options.encoding, "cp866")||!strcasecmp(options.encoding, "cp-866"))
+			options.cp866 = 1;
		else {
			fprintf (stderr, "Supported encodings within john are: raw, utf-8, iso-8859-1 (or ansi)"
					", koi8-r"
					", cp1251"
+					", cp866"
					"\n");
			error();
		}
	}

First test build

If there are problems, then by all means FIX THEM NOW, while the changes are small. NOTE: when adding your own code page, you will NOT use CP866 in your variable names. Pick variable names that are unused, but that self-document what the variable actually contains.

Adding the 'casing' data to ./src/unicode.c

	// now handle upper 128 byte values for casing.
	if (options.koi8_r) {
		cpU = (unsigned char*)CHARS_UPPER_KOI8_R; cpL = (unsigned char*)CHARS_LOWER_KOI8_R;
	} else if (options.cp1251) {
		cpU = (unsigned char*)CHARS_UPPER_CP1251; cpL = (unsigned char*)CHARS_LOWER_CP1251;
+	} else if (options.cp866) {
+		cpU = (unsigned char*)CHARS_UPPER_CP866; cpL = (unsigned char*)CHARS_LOWER_CP866;
	} else {
		cpU = (unsigned char*)CHARS_UPPER_8859_1; cpL = (unsigned char*)CHARS_LOWER_8859_1;
	}

Adding the 'casing' data to ./src/rules.c

static void rules_init_classes(void)
{
	static unsigned char eightbitchars[129];
	int i;
	memset(rules_classes, 0, sizeof(rules_classes));
 
	// this is an ugly hack but it works fine, used for 'b' below
	for(i=0;i<128;i++)
		eightbitchars[i] = i+128;
	eightbitchars[128] = 0;
 
	rules_init_class('?', "?");
	rules_init_class('v', "aeiouAEIOU");
	rules_init_class('c', "bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ");
	rules_init_class('w', " \t");
	rules_init_class('p', ".,:;'\"?!`");
	rules_init_class('s', "$%^&*()-_+=|\\<>[]{}#@/~");
	if (options.iso8859_1) {
		rules_init_class('l', CHARS_LOWER CHARS_LOWER_8859_1);
		rules_init_class('u', CHARS_UPPER CHARS_UPPER_8859_1);
	} else if (options.koi8_r) {
		rules_init_class('l', CHARS_LOWER CHARS_LOWER_KOI8_R);
		rules_init_class('u', CHARS_UPPER CHARS_UPPER_KOI8_R);
	} else if (options.cp1251) {
		rules_init_class('l', CHARS_LOWER CHARS_LOWER_CP1251);
		rules_init_class('u', CHARS_UPPER CHARS_UPPER_CP1251);
+	} else if (options.cp866) {
+		rules_init_class('l', CHARS_LOWER CHARS_LOWER_CP866);
+		rules_init_class('u', CHARS_UPPER CHARS_UPPER_CP866);
	} else {
		rules_init_class('l', CHARS_LOWER);
		rules_init_class('u', CHARS_UPPER);
	}
	rules_init_class('d', CHARS_DIGITS);
	if (options.iso8859_1) {
		rules_init_class('a', CHARS_LOWER CHARS_UPPER CHARS_LOWER_8859_1 CHARS_UPPER_8859_1);
		rules_init_class('x', CHARS_LOWER CHARS_UPPER CHARS_LOWER_8859_1 CHARS_UPPER_8859_1 CHARS_DIGITS);
	} else if (options.koi8_r) {
		rules_init_class('a', CHARS_LOWER CHARS_UPPER CHARS_LOWER_KOI8_R CHARS_UPPER_KOI8_R);
		rules_init_class('x', CHARS_LOWER CHARS_UPPER CHARS_LOWER_KOI8_R CHARS_UPPER_KOI8_R CHARS_DIGITS);
	} else if (options.cp1251) {
		rules_init_class('a', CHARS_LOWER CHARS_UPPER CHARS_LOWER_CP1251 CHARS_UPPER_CP1251);
		rules_init_class('x', CHARS_LOWER CHARS_UPPER CHARS_LOWER_CP1251 CHARS_UPPER_CP1251 CHARS_DIGITS);
+	} else if (options.cp866) {
+		rules_init_class('a', CHARS_LOWER CHARS_UPPER CHARS_LOWER_CP866 CHARS_UPPER_CP866);
+		rules_init_class('x', CHARS_LOWER CHARS_UPPER CHARS_LOWER_CP866 CHARS_UPPER_CP866 CHARS_DIGITS);
	} else {
		rules_init_class('a', CHARS_LOWER CHARS_UPPER);
		rules_init_class('x', CHARS_LOWER CHARS_UPPER CHARS_DIGITS);
	}
	rules_init_class('b', (char *)&eightbitchars);
	rules_init_class('Z', "");
}
static void rules_init_convs(void)
{
	conv_shift = rules_init_conv(conv_source, CONV_SHIFT);
	conv_invert = rules_init_conv(conv_source, CONV_INVERT);
	conv_vowels = rules_init_conv(conv_source, CONV_VOWELS);
	conv_right = rules_init_conv(conv_source, CONV_RIGHT);
	conv_left = rules_init_conv(conv_source, CONV_LEFT);
 
	if (options.iso8859_1) {
		conv_tolower = rules_init_conv(CHARS_UPPER CHARS_UPPER_8859_1, CHARS_LOWER CHARS_LOWER_8859_1);
		conv_toupper = rules_init_conv(CHARS_LOWER CHARS_LOWER_8859_1, CHARS_UPPER CHARS_UPPER_8859_1);
	} else if (options.koi8_r) {
		conv_tolower = rules_init_conv(CHARS_UPPER CHARS_UPPER_KOI8_R, CHARS_LOWER CHARS_LOWER_KOI8_R);
		conv_toupper = rules_init_conv(CHARS_LOWER CHARS_LOWER_KOI8_R, CHARS_UPPER CHARS_UPPER_KOI8_R);
	} else if (options.cp1251) {
		conv_tolower = rules_init_conv(CHARS_UPPER CHARS_UPPER_CP1251, CHARS_LOWER CHARS_LOWER_CP1251);
		conv_toupper = rules_init_conv(CHARS_LOWER CHARS_LOWER_CP1251, CHARS_UPPER CHARS_UPPER_CP1251);
+	} else if (options.cp866) {
+		conv_tolower = rules_init_conv(CHARS_UPPER CHARS_UPPER_CP866, CHARS_LOWER CHARS_LOWER_CP866);
+		conv_toupper = rules_init_conv(CHARS_LOWER CHARS_LOWER_CP866, CHARS_UPPER CHARS_UPPER_CP866);
	} else {
		conv_tolower = rules_init_conv(CHARS_UPPER, CHARS_LOWER);
		conv_toupper = rules_init_conv(CHARS_LOWER, CHARS_UPPER);
	}
}

The encoding data should now be complete, within john

CONGRATULATIONS you have added codepage CP866 to john.

Building Test cases (in john test suite)

Testing your work

What if something does not test right?

Creating the patch to share with others in the john community

Handling the WARNING, char at ord(0xHH) U+HHHH needs to be looked into

Handling CP's if the lower 128 chars are NOT the same as ASCII lower 128