#!/bin/bash
## Lists all non-ASCII UTF-8 characters contained in the given files.
## For each character it prints:
##   1. the bytes of the character as \xHH escapes,
##   2. the output of `grep -c` (number of lines containing the character,
##      per file),
##   3. the first line of each file containing the character, with the
##      character highlighted.
##
## Usage: <script> FILE...
## Author: E. Choroba

# Byte-wise semantics for grep and sort: every byte is its own character,
# so multi-byte UTF-8 sequences can be matched byte by byte.
export LC_ALL=C

#######################################
# Reports every distinct multi-byte UTF-8 sequence found in the files.
# Globals:   none (relies on LC_ALL=C exported above)
# Arguments: one or more file names
# Outputs:   per character: escape line, `grep -c` output, `grep -m1` output
# Returns:   2 when no file is given; otherwise the status of the last
#            grep that was run (0 when no non-ASCII character was found)
#######################################
list_non_ascii() {
  if (( $# == 0 )); then
    # Without files every grep would read stdin; only the first one would
    # see any data and the report would be meaningless.
    printf 'Usage: %s FILE...\n' "${0##*/}" >&2
    return 2
  fi

  # Continuation byte of a UTF-8 sequence.
  local -r cont=$'[\x80-\xbf]'
  # Lead byte classes for 2-, 3- and 4-byte sequences.
  local -r two=$'[\xc0-\xdf]'"${cont}"
  local -r three=$'[\xe0-\xef]'"${cont}${cont}"
  local -r four=$'[\xf0-\xf4]'"${cont}${cont}${cont}"

  local -a chars=() hex=()
  local char code

  # One pass over the data; one match per line, so reading line by line
  # keeps every sequence intact (no word splitting, no globbing).
  while IFS= read -r char; do
    chars+=("$char")
  done < <(grep -oh -e "$two" -e "$three" -e "$four" -- "$@" | sort -u)

  for char in "${chars[@]}"; do
    # od -tx1 dumps single bytes, so the result does not depend on the
    # host byte order and works for any sequence length.
    read -r -a hex < <(printf '%s' "$char" | od -An -v -tx1)
    code=$(printf '\\x%s' "${hex[@]}")

    printf '%s\n' "$code"
    # The pattern consists of bytes >= 0x80 only; -F makes it explicit that
    # it is a literal, and -- protects file names starting with a dash.
    grep -c -F -e "$char" -- "$@"
    grep -m1 --color=always -F -e "$char" -- "$@"
  done
}

list_non_ascii "$@"