This is for developers or other advanced users. So, you have a set of hashes, and they are in a code page for some language which John does not support. Well, John 'can' have code pages added to its support.
The code pages I know to work are 8-bit (256 characters) with a left-to-right reading direction. I do not know if other reading directions can be added. John does already have UTF-8 support, but since UTF-8 is a variable-width encoding, there are many things within John which do not work with it. For one, some rules will not work well with UTF-8.
However, with a little work, a character set CAN be fully added to John. This help page was written while adding code page CP866 (DOS/LM Russian). It walks through the steps needed to make sure that nothing is missed, and that all features of John work with the new code page.
perl Unicode/cmpt_cp.pl -v cp866 // here is the CP866 to Unicode conversion for CP866 characters from 0x80 to 0xFF static UTF16 CP866_to_unicode_high128[] = { 0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E,0x041F, 0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,0x042E,0x042F, 0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x043E,0x043F, 0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,0x2555,0x2563,0x2551,0x2557,0x255D,0x255C,0x255B,0x2510, 0x2514,0x2534,0x252C,0x251C,0x2500,0x253C,0x255E,0x255F,0x255A,0x2554,0x2569,0x2566,0x2560,0x2550,0x256C,0x2567, 0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256B,0x256A,0x2518,0x250C,0x2588,0x2584,0x258C,0x2590,0x2580, 0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,0x044E,0x044F, 0x0401,0x0451,0x0404,0x0454,0x0407,0x0457,0x040E,0x045E,0x00B0,0x2219,0x00B7,0x221A,0x2116,0x00A4,0x25A0,0x00A0 }; #define CHARS_LOWER_CP866 \ "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF1\xF3\xF5\xF7" #define CHARS_LOW_ONLY_CP866 #define CHARS_UPPER_CP866 \ "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xF0\xF2\xF4\xF6" #define CHARS_UP_ONLY_CP866 #define CHARS_DIGITS_CP866 #define CHARS_PUNCTUATION_CP866 "\xFA" #define CHARS_SPECIALS_CP866 \ "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xF8\xF9\xFB\xFC\xFD\xFE" #define CHARS_ALPHA_CP866 \ 
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7" #define CHARS_WHITESPACE_CP866 "\xFF" #define CHARS_CONTROL_CP866 #define CHARS_INVALID_CP866 "" #define CHARS_VOWELS_CP866 \ "\x59\x79\x80\x85\x88\x89\x8E\x93\x9B\x9D\x9E\x9F\xA0\xA5\xA8\xA9\xAE\xE3\xEB\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7" #define CHARS_CONSONANTS_CP866 \ "\x81\x82\x83\x84\x86\x87\x8A\x8B\x8C\x8D\x8F\x90\x91\x92\x94\x95\x96\x97\x98\x99\x9A\x9C\xA1\xA2\xA3\xA4\xA6\xA7\xAA\xAB\xAC\xAD\xAF\xE0\xE1\xE2\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEC"
* Edit ./src/rules.c Change this code:
// this will 'pacify' compiler warnings. UTF16 *JunkStuff[] = {KOI8_r_to_unicode_high128, CP1251_to_unicode_high128};
* to this. We add our new variable to this 'junk' array. This simply pacifies the compiler warnings.
// this will 'pacify' compiler warnings. UTF16 *JunkStuff[] = {KOI8_r_to_unicode_high128, CP1251_to_unicode_high128, CP866_to_unicode_high128};
* Now, within ./src/unicode.c We are going to modify some code in the initUnicode() function * The 2 blocks of code we are going to modify are right under the comment: “Here we setup the 8-bit codepages we handle, and setup the mapping values into Unicode.” * this shows the changed code, for our CP866 data:
// Here we setup the 8-bit codepages we handle, and setup the mapping values into Unicode. for (i = 0; i < 128; ++i) { CP_to_Unicode[i] = i; } for (i = 128; i < 256; ++i) { if (options.cp1251) CP_to_Unicode[i] = CP1251_to_unicode_high128[i-128]; else if (options.koi8_r) CP_to_Unicode[i] = KOI8_r_to_unicode_high128[i-128]; + else if (options.cp866) + CP_to_Unicode[i] = CP866_to_unicode_high128[i-128]; else CP_to_Unicode[i] = i; } for (i = 0; i < 0x10000; ++i) CP_from_Unicode[i] = i; // will truncate to lower 8 bits. for (i = 0; i < 128; ++i) { if (options.cp1251) CP_from_Unicode[CP1251_to_unicode_high128[i]] = i+128; else if (options.koi8r) CP_from_Unicode[KOI8_r_to_unicode_high128[i]] = i+128; + else if (options.cp866) + CP_from_Unicode[CP866_to_unicode_high128[i]] = i+128; else { // for iso-8859-1, this is the only change to 'straight' 0 to 0xFF -> 0 to 0xFF. CP_from_Unicode[0x39C] = 0xB5; break; } }
/* wordfile character encoding 'stuff' */ int encoding_7_bit; // if NO other charset is used, we set this to 1. This tells us to user 7 bit ASCII. int utf8; int iso8859_1; int koi8_r; int cp1251; + int cp866; };
- options.utf8 = options.iso8859_1 = options.koi8_r = options.cp1251 = 0; + options.utf8 = options.iso8859_1 = options.koi8_r = options.cp1251 = options.cp866 = 0; // by 'default' we are setup in 7 bit ascii mode (for rules). options.encoding_7_bit = 1; if ( (options.flags & FLG_INP_ENCODING) && options.encoding) { // Ok, check a 'few' valid things for utf8 options.encoding_7_bit = 0; if (!strcasecmp(options.encoding, "utf8")||!strcasecmp(options.encoding, "utf-8")) options.utf8 = 1; else if (!strcasecmp(options.encoding, "ansi")||!strcasecmp(options.encoding, "iso-8859-1")||!strcasecmp(options.encoding, "8859-1")) options.iso8859_1 = 1; else if (!strcasecmp(options.encoding, "koi8-r")||!strcasecmp(options.encoding, "koi8r")) options.koi8_r = 1; else if (!strcasecmp(options.encoding, "cp1251")||!strcasecmp(options.encoding, "cp-1251")) options.cp1251 = 1; + else if (!strcasecmp(options.encoding, "cp866")||!strcasecmp(options.encoding, "cp-866")) + options.cp866 = 1; else { fprintf (stderr, "Supported encodings within john are: raw, utf-8, iso-8859-1 (or ansi)" ", koi8-r" ", cp1251" + ", cp866" "\n"); error(); } }
If there are problems, then by all means FIX THEM NOW, while the changes are small. NOTE: when adding your own code page, you will NOT use CP866 for your variable names. Pick variable names that are unused, but that self-document what the variable actually contains.
// now handle upper 128 byte values for casing. if (options.koi8_r) { cpU = (unsigned char*)CHARS_UPPER_KOI8_R; cpL = (unsigned char*)CHARS_LOWER_KOI8_R; } else if (options.cp1251) { cpU = (unsigned char*)CHARS_UPPER_CP1251; cpL = (unsigned char*)CHARS_LOWER_CP1251; + } else if (options.cp866) { + cpU = (unsigned char*)CHARS_UPPER_CP866; cpL = (unsigned char*)CHARS_LOWER_CP866; } else { cpU = (unsigned char*)CHARS_UPPER_8859_1; cpL = (unsigned char*)CHARS_LOWER_8859_1; }
static void rules_init_classes(void) { static unsigned char eightbitchars[129]; int i; memset(rules_classes, 0, sizeof(rules_classes)); // this is an ugly hack but it works fine, used for 'b' below for(i=0;i<128;i++) eightbitchars[i] = i+128; eightbitchars[128] = 0; rules_init_class('?', "?"); rules_init_class('v', "aeiouAEIOU"); rules_init_class('c', "bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ"); rules_init_class('w', " \t"); rules_init_class('p', ".,:;'\"?!`"); rules_init_class('s', "$%^&*()-_+=|\\<>[]{}#@/~"); if (options.iso8859_1) { rules_init_class('l', CHARS_LOWER CHARS_LOWER_8859_1); rules_init_class('u', CHARS_UPPER CHARS_UPPER_8859_1); } else if (options.koi8_r) { rules_init_class('l', CHARS_LOWER CHARS_LOWER_KOI8_R); rules_init_class('u', CHARS_UPPER CHARS_UPPER_KOI8_R); } else if (options.cp1251) { rules_init_class('l', CHARS_LOWER CHARS_LOWER_CP1251); rules_init_class('u', CHARS_UPPER CHARS_UPPER_CP1251); + } else if (options.cp866) { + rules_init_class('l', CHARS_LOWER CHARS_LOWER_CP866); + rules_init_class('u', CHARS_UPPER CHARS_UPPER_CP866); } else { rules_init_class('l', CHARS_LOWER); rules_init_class('u', CHARS_UPPER); } rules_init_class('d', CHARS_DIGITS); if (options.iso8859_1) { rules_init_class('a', CHARS_LOWER CHARS_UPPER CHARS_LOWER_8859_1 CHARS_UPPER_8859_1); rules_init_class('x', CHARS_LOWER CHARS_UPPER CHARS_LOWER_8859_1 CHARS_UPPER_8859_1 CHARS_DIGITS); } else if (options.koi8_r) { rules_init_class('a', CHARS_LOWER CHARS_UPPER CHARS_LOWER_KOI8_R CHARS_UPPER_KOI8_R); rules_init_class('x', CHARS_LOWER CHARS_UPPER CHARS_LOWER_KOI8_R CHARS_UPPER_KOI8_R CHARS_DIGITS); } else if (options.cp1251) { rules_init_class('a', CHARS_LOWER CHARS_UPPER CHARS_LOWER_CP1251 CHARS_UPPER_CP1251); rules_init_class('x', CHARS_LOWER CHARS_UPPER CHARS_LOWER_CP1251 CHARS_UPPER_CP1251 CHARS_DIGITS); + } else if (options.cp866) { + rules_init_class('a', CHARS_LOWER CHARS_UPPER CHARS_LOWER_CP866 CHARS_UPPER_CP866); + rules_init_class('x', CHARS_LOWER CHARS_UPPER 
CHARS_LOWER_CP866 CHARS_UPPER_CP866 CHARS_DIGITS); } else { rules_init_class('a', CHARS_LOWER CHARS_UPPER); rules_init_class('x', CHARS_LOWER CHARS_UPPER CHARS_DIGITS); } rules_init_class('b', (char *)&eightbitchars); rules_init_class('Z', ""); }
static void rules_init_convs(void) { conv_shift = rules_init_conv(conv_source, CONV_SHIFT); conv_invert = rules_init_conv(conv_source, CONV_INVERT); conv_vowels = rules_init_conv(conv_source, CONV_VOWELS); conv_right = rules_init_conv(conv_source, CONV_RIGHT); conv_left = rules_init_conv(conv_source, CONV_LEFT); if (options.iso8859_1) { conv_tolower = rules_init_conv(CHARS_UPPER CHARS_UPPER_8859_1, CHARS_LOWER CHARS_LOWER_8859_1); conv_toupper = rules_init_conv(CHARS_LOWER CHARS_LOWER_8859_1, CHARS_UPPER CHARS_UPPER_8859_1); } else if (options.koi8_r) { conv_tolower = rules_init_conv(CHARS_UPPER CHARS_UPPER_KOI8_R, CHARS_LOWER CHARS_LOWER_KOI8_R); conv_toupper = rules_init_conv(CHARS_LOWER CHARS_LOWER_KOI8_R, CHARS_UPPER CHARS_UPPER_KOI8_R); } else if (options.cp1251) { conv_tolower = rules_init_conv(CHARS_UPPER CHARS_UPPER_CP1251, CHARS_LOWER CHARS_LOWER_CP1251); conv_toupper = rules_init_conv(CHARS_LOWER CHARS_LOWER_CP1251, CHARS_UPPER CHARS_UPPER_CP1251); + } else if (options.cp866) { + conv_tolower = rules_init_conv(CHARS_UPPER CHARS_UPPER_CP866, CHARS_LOWER CHARS_LOWER_CP866); + conv_toupper = rules_init_conv(CHARS_LOWER CHARS_LOWER_CP866, CHARS_UPPER CHARS_UPPER_CP866); } else { conv_tolower = rules_init_conv(CHARS_UPPER, CHARS_LOWER); conv_toupper = rules_init_conv(CHARS_LOWER, CHARS_UPPER); } }
CONGRATULATIONS, you have added code page CP866 to John.