;;; -*- coding: iso-2022-7bit -*-
;;; utf-8m.el --- modified UTF-8 encoding for Mac OS X hfs plus volume format

;; Copyright (C) 2004-2005  Seiji Zenitani

;; Author: Seiji Zenitani
;; Version: v20050805
;; Keywords: mac, multilingual, Unicode, UTF-8
;; Created: 2004-02-20
;; Compatibility: Mac OS X (Carbon Emacs)
;; URL(jp): http://home.att.ne.jp/alpha/z123/emacs-mac-j.html
;; URL(en): http://home.att.ne.jp/alpha/z123/elisp-e.html

;; This file is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; This file is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to
;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;;; Commentary:

;; This package provides a modified UTF-8 encoding (utf-8m) for the
;; Mac OS X HFS Plus volume format.  When utf-8m is set as the
;; file-name-coding-system, Emacs can read the following characters
;; in file names:
;;
;; * Japanese Kana characters with Dakuten/Han-Dakuten signs
;; * Latin characters with diacritical marks (accents, umlauts, tilde, etc.)
;;
;; Note that utf-8m does not restore the above characters to their
;; original form when it exports file names.  Fortunately, it seems
;; that the filesystem knows how to deal with such invalid filenames.
;;
;; To use it, add the following line to your .emacs file.
;;
;;   (set-file-name-coding-system 'utf-8m)
;;
;;; About utf-8m (translated from the original Japanese notes)

;; This file provides a modified UTF-8 encoding (utf-8m) for reading
;; file names on the Mac OS X HFS+ file system.  Because the
;; normalization form is changed while file names are read, Japanese
;; characters with dakuten/han-dakuten marks and accented Latin
;; characters are no longer garbled.  No conversion is attempted when
;; file names are written out, but the file system appears to handle
;; that case by itself.  To use utf-8m, load this file and then
;; evaluate
;;
;;   (set-file-name-coding-system 'utf-8m)
;;

;;; Code:

;; Japanese Kana characters with Dakuten/Han-Dakuten signs

;; NOTE(review): the two codes below look like the Emacs 21/22 internal
;; character codes (charset mule-unicode-2500-33ff) for U+3099 and
;; U+309A.  They are specific to that character representation.
(defconst utf-8m-combining-dakuten 302969
  "Character code of the combining dakuten (voiced sound) mark.")

(defconst utf-8m-combining-handakuten 302970
  "Character code of the combining han-dakuten (semi-voiced sound) mark.")

(defvar utf-8m-fix-kana1-alist
  '(?か ?き ?く ?け ?こ
    ?さ ?し ?す ?せ ?そ
    ?た ?ち ?つ ?て ?と
    ?は ?ひ ?ふ ?へ ?ほ
    ?カ ?キ ?ク ?ケ ?コ
    ?サ ?シ ?ス ?セ ?ソ
    ?タ ?チ ?ツ ?テ ?ト
    ?ハ ?ヒ ?フ ?ヘ ?ホ
    ?ヽ ?ゝ)
  "Kana that take a dakuten.
The dakuten form of each character is the character whose code is
one greater.")

(defvar utf-8m-fix-kana2-alist
  '(?は ?ひ ?ふ ?へ ?ほ
    ?ハ ?ヒ ?フ ?ヘ ?ホ)
  "Kana that take a han-dakuten.
The han-dakuten form of each character is the character whose code
is two greater.")

;; (defvar utf-8m-fix-kana3-alist
;;   '(?ワ ?ヰ ?ヱ ?ヲ))

(defun utf-8m-post-read-kana-conversion (length)
  "Compose kana that are followed by a combining dakuten or han-dakuten.
Operate on the LENGTH characters that start at point.  Each pair of
a base kana and a combining mark is replaced by the single composed
character.  Text outside that region is left untouched and point is
preserved.  Return the number of characters in the region after the
conversion."
  (save-excursion
    (save-restriction
      ;; Stay inside the decoded text; the original scanned to the end
      ;; of the buffer.
      (narrow-to-region (point) (min (point-max) (+ (point) length)))
      (while (not (eobp))
        (let* ((base (char-before))     ; nil at the start of the region
               (mark (char-after))
               (composed
                (cond
                 ;; A mark with nothing in front of it cannot be
                 ;; composed (and `=' would signal an error on nil).
                 ((null base) nil)
                 ((= mark utf-8m-combining-dakuten)
                  (cond ((memq base utf-8m-fix-kana1-alist) (+ base 1))
                        ;; ((memq base utf-8m-fix-kana3-alist) (+ base 1244))
                        ((= base ?ウ) ?ヴ)))
                 ((= mark utf-8m-combining-handakuten)
                  (and (memq base utf-8m-fix-kana2-alist) (+ base 2))))))
          (when composed
            (delete-char -1)            ; the base kana
            (delete-char 1)             ; the combining mark
            (insert composed))
          (unless (eobp)
            (forward-char))))
      (- (point-max) (point-min)))))

;; Latin characters with diacritical marks

;; Each entry is (COMBINING-MARK . ((BASE . COMPOSED) ...)).
;; NOTE(review): the integer keys look like the Emacs 21/22 internal
;; character codes (charset mule-unicode-0100-24ff) for the combining
;; marks U+0300, U+0301, U+0302, U+0303, U+0308, U+030A and U+0327.
(defvar utf-8m-fix-latin-alist
  '((332480 . ( ;; grave
               (?A . ?À) (?E . ?È) (?I . ?Ì) (?O . ?Ò) (?U . ?Ù)
               (?a . ?à) (?e . ?è) (?i . ?ì) (?o . ?ò) (?u . ?ù)))
    (332481 . ( ;; acute
               (?A . ?Á) (?E . ?É) (?I . ?Í) (?O . ?Ó) (?U . ?Ú)
               (?a . ?á) (?e . ?é) (?i . ?í) (?o . ?ó) (?u . ?ú)))
    (332482 . ( ;; circumflex
               (?A . ?Â) (?E . ?Ê) (?I . ?Î) (?O . ?Ô) (?U . ?Û)
               (?a . ?â) (?e . ?ê) (?i . ?î) (?o . ?ô) (?u . ?û)))
    (332483 . ( ;; tilde
               (?A . ?Ã) (?N . ?Ñ) (?O . ?Õ)
               (?a . ?ã) (?n . ?ñ) (?o . ?õ)))
    (332488 . ( ;; umlaut
               (?A . ?Ä) (?E . ?Ë) (?I . ?Ï) (?O . ?Ö) (?U . ?Ü)
               (?a . ?ä) (?e . ?ë) (?i . ?ï) (?o . ?ö) (?u . ?ü)
               (?y . ?ÿ)))
    (332490 . ( ;; angstrom
               (?A . ?Å) (?a . ?å)))
    (332519 . ( ;; cedilla
               (?C . ?Ç) (?c . ?ç))))
  "Alist mapping a combining mark to its (BASE . COMPOSED) alist.")

(defun utf-8m-post-read-latin-conversion (length)
  "Compose Latin letters that are followed by a single combining mark.
Operate on the LENGTH characters that start at point, scanning from
the end of that region towards its beginning.  A base letter is
composed only when exactly one combining mark from
`utf-8m-fix-latin-alist' follows it; runs of two or more marks are
left alone.  Text outside the region is left untouched and point is
preserved.  Return the number of characters in the region after the
conversion."
  (save-excursion
    (save-restriction
      ;; Stay inside the decoded text; the original scanned back to the
      ;; beginning of the buffer.
      (narrow-to-region (point) (min (point-max) (+ (point) length)))
      (goto-char (point-max))
      (let ((pending nil)               ; (BASE . COMPOSED) alist of the last mark seen
            (mark-run 0))               ; number of consecutive marks just scanned
        (while (not (bobp))
          (let* ((ch (char-before))
                 (composed (and (= mark-run 1)
                                (cdr (assq ch pending))))
                 (entry (and (not composed)
                             (assq ch utf-8m-fix-latin-alist))))
            (cond
             (composed
              (delete-char -1)          ; the base letter
              (delete-char 1)           ; the combining mark after it
              (insert composed)
              (setq mark-run 0))
             (entry
              (setq pending (cdr entry)
                    mark-run (1+ mark-run)))
             (t
              (setq mark-run 0)))
            (unless (bobp)
              (backward-char)))))
      (- (point-max) (point-min)))))

;; ;; Korean Hangul characters
;; ;; ref. http://www.unicode.org/reports/tr15/#Hangul
;; (defun utf-8m-post-read-hangul-conversion (length)
;;   "Document forthcoming..."
;;   (save-excursion
;;     (let* ((ch1 nil)
;;            (ch2 nil)
;;            (sbase #xac00)
;;            (lbase #x1100)
;;            (vbase #x1161)
;;            (tbase #x11a7)
;;            (lcount 19)
;;            (vcount 21)
;;            (tcount 28)
;;            (ncount (* vcount tcount)) ; 588
;;            (scount (* lcount ncount)) ; 11172
;;            (lindex nil)
;;            (vindex nil)
;;            (sindex nil)
;;            (tindex nil))
;;       (setq ch1 (char-to-ucs (char-after)))
;;       ; (setq ch1 (decode-char (char-after) 'ucs))
;;       ; (setq ch1 (encode-char (char-after) 'ucs))
;;       ; (setq ch1 (mule-unicode-xxxx-to-ucs (char-after)))
;;       (if (not (eobp))(forward-char))
;;       (while (not (eobp))
;;         (setq ch2 (char-to-ucs (char-after)))
;;         ; (setq ch2 (encode-char (char-after) 'ucs))
;;         ; (setq ch2 (decode-char (char-after) 'ucs))
;;         ; (setq ch2 (mule-unicode-xxxx-to-ucs (char-after)))
;;         ; (message "ch1:%X ch2:%X" ch1 ch2)
;;         (setq lindex (- ch1 lbase))
;;         (setq vindex (- ch2 vbase))
;;         (setq sindex (- ch1 sbase))
;;         (setq tindex (- ch2 tbase))
;;         (if (and (>= lindex 0)(< lindex lcount)
;;                  (>= vindex 0)(< vindex vcount))
;;             (progn
;;               ; (message "first loop")
;;               (setq ch1 (+ sbase (* (+ (* lindex vcount) vindex) tcount)))
;;               (delete-char -1)
;;               (delete-char 1)
;;               ; (insert-ucs-character ch1)
;;               (ucs-insert ch1)
;;               (setq length (- length 1))
;;               )
;;           (if (and (>= sindex 0)(< sindex scount)
;;                    (= (% sindex tcount) 0)
;;                    (>= tindex 0)(< tindex tcount))
;;               (progn
;;                 ; (message "second loop")
;;                 (setq ch1 (+ ch1 tindex))
;;                 (delete-char -1)
;;                 (delete-char 1)
;;                 ; (insert-ucs-character ch1)
;;                 (ucs-insert ch1)
;;                 (setq length (- length 1))
;;                 )
;;             (progn
;;               (setq ch1 ch2)
;;               (if (not (eobp))(forward-char))
;;               )
;;             ))
;;         )))
;;   length)

;; Chain the converters above into a single post-read-conversion function.

(defun utf-8m-post-read-conversion (length)
  "Post-read conversion function of the `utf-8m' coding system.
Call `utf-8-post-read-conversion', then
`utf-8m-post-read-kana-conversion', then
`utf-8m-post-read-latin-conversion', in that order.  Each stage
receives the length returned by the previous one, starting with
LENGTH, and runs inside its own `save-excursion'.  Return the length
reported by the last stage."
  ;; The disabled Hangul stage (`utf-8m-post-read-hangul-conversion')
  ;; would sit between the kana and the latin stage.
  (dolist (stage '(utf-8-post-read-conversion
                   utf-8m-post-read-kana-conversion
                   utf-8m-post-read-latin-conversion))
    (save-excursion
      (setq length (funcall stage length))))
  length)

;; Define the coding system itself.  Apart from the post-read-conversion
;; entry, every value is a name taken from Emacs' own UTF-8 support.

(make-coding-system
 'utf-8m 4 ?u
 "modified UTF-8 encoding for Mac OS X hfs plus volume format."
 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 `((safe-charsets
    ascii
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff
    ;; The CJK charsets are spliced in only when the mode is enabled
    ;; at the time this form is evaluated.
    ,@(if utf-translate-cjk-mode
          utf-translate-cjk-charsets))
   (mime-charset . utf-8)
   (coding-category . coding-category-utf-8)
   (valid-codes (0 . 255))
   (pre-write-conversion . utf-8-pre-write-conversion)
   (post-read-conversion . utf-8m-post-read-conversion)
   (translation-table-for-encode . utf-translation-table-for-encode)
   (dependency unify-8859-on-encoding-mode
               unify-8859-on-decoding-mode
               utf-fragment-on-decoding
               utf-translate-cjk-mode)))

;; (set-file-name-coding-system 'utf-8m)

(provide 'utf-8m)

;; utf-8m.el ends here.