Browse Source

add sjisstat, a Shift-JIS character counter, useful for statistics on text files

vampirefrog 1 year ago
parent
commit
045c182939
2 changed files with 62 additions and 0 deletions
  1. 3
    0
      Makefile
  2. 59
    0
      sjisstat.c

+ 3
- 0
Makefile View File

@@ -24,6 +24,9 @@ xdump: xdump.o tools.o
24 24
 sjis2utf8: sjis2utf8.o sjis.o sjis_unicode.o utf8.o
25 25
 	gcc $^ -o $@
26 26
 
27
+sjisstat: sjisstat.o sjis.o sjis_unicode.o utf8.o
28
+	gcc $^ -o $@
29
+
27 30
 test-mem: test-mem.o v68.o v68human.o v68opm.o v68io.o v68doscall.o v68fecall.o v68iocscall.o okim6258.o ym2151.o vgm.o sjis.o sjis_unicode.o $(MUSASHIOBJS) $(MUSASHIGENOBJS)
28 31
 	gcc $^ -lao -lm -o $@
29 32
 

+ 59
- 0
sjisstat.c View File

@@ -0,0 +1,59 @@
1
+#include <stdint.h>
2
+#include <stdio.h>
3
+#include <stdlib.h>
4
+#include "sjis.h"
5
+
6
/**
 * One entry of the character-frequency table.
 *
 * chr: the character code this entry counts; it stays 0 until the code is
 *      first pushed, because the table starts zero-initialized.
 * cnt: number of times the code has been pushed.
 */
struct charstat {
	int chr, cnt;
};

/*
 * Frequency table with one slot per 16-bit character code.
 * File-scope object, so every chr/cnt starts at 0.
 */
struct charstat stats[65536];

/**
 * Record one occurrence of a character code.
 *
 * Only the low 16 bits of chr are used, so any int is a safe argument.
 * The code is also stored in the entry itself so that it survives the
 * table being reordered later (after sorting, index != code).
 */
void push_charstat(int chr) {
	chr &= 0xffff;
	stats[chr].chr = chr;
	stats[chr].cnt++;
}

/**
 * qsort() comparator for struct charstat.
 *
 * Orders entries by descending count. Entries with equal counts are ordered
 * by ascending character code: qsort() leaves the relative order of equal
 * elements unspecified, so without this tie-break the output order would
 * depend on the C library in use.
 *
 * Uses explicit comparisons rather than subtraction so the result can never
 * overflow, and keeps the pointers const-qualified.
 *
 * Returns <0 if *p1 sorts before *p2, >0 if after, 0 if equivalent.
 */
int cmpstatfn(const void *p1, const void *p2) {
	const struct charstat *a = p1;
	const struct charstat *b = p2;

	if(a->cnt != b->cnt)
		return a->cnt < b->cnt ? 1 : -1;
	if(a->chr != b->chr)
		return a->chr < b->chr ? -1 : 1;
	return 0;
}
21
+
22
+int main(int argc, char **argv) {
23
+	FILE *in = stdin;
24
+
25
+	uint8_t buf[1024];
26
+	int last_byte = 0;
27
+	int l;
28
+	while((l = fread(buf, 1, 1024, in)) > 0) {
29
+		for(int i = 0; i < l; i++) {
30
+			uint8_t b = buf[i];
31
+			if(last_byte == 0 && SJIS_FIRST_CHAR(b)) {
32
+				last_byte = b;
33
+			} else {
34
+				push_charstat((last_byte << 8) | b);
35
+				last_byte = 0;
36
+			}
37
+		}
38
+	}
39
+
40
+	qsort(stats, sizeof(stats) / sizeof(stats[0]), sizeof(stats[0]), cmpstatfn);
41
+
42
+	for(int i = 0; i < 65536; i++) {
43
+		if(stats[i].cnt == 0) break;
44
+		int chr = stats[i].chr;
45
+		char utfbuf[5];
46
+		if(chr < 0x20) {
47
+			snprintf(utfbuf, 5, "0x%02x", chr);
48
+		} else {
49
+			int l = utf8_encode(sjis_char_to_unicode(chr), utfbuf);
50
+			utfbuf[l] = 0;
51
+		}
52
+		if(chr > 0xff)
53
+			printf("0x%02x 0x%02x %d %s\n", chr >> 8, chr & 0xff, stats[i].cnt, utfbuf);
54
+		else
55
+			printf("     0x%02x %d %s\n", chr & 0xff, stats[i].cnt, utfbuf);
56
+	}
57
+
58
+	return 0;
59
+}

Loading…
Cancel
Save