2012-01-18 :-(
_ tdiary grep.rb で invalid multibyte character (1)
青木さんのアレ http://i.loveruby.net/svn/public/tdiarytools/trunk/grep.rb
% ruby --version
ruby 1.9.2p180 (2011-02-18 revision 30909) [i386-netbsdelf]
UTF-8 対応と Ruby 1.9 対応をしてみる。あわせて、いわゆるデバッグ print も追加した。
--- C:/Users/rin/Desktop/grep.rb.orig Wed Jan 18 23:21:35 2012
+++ C:/Users/rin/Desktop/grep.rb Wed Jan 18 23:19:57 2012
@@ -1,4 +1,5 @@
#!/usr/bin/env ruby
+# -*- coding: utf-8 -*-
#
# $Id$
#
@@ -26,7 +27,7 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html lang="ja-JP">
<head>
- <meta http-equiv="Content-Type" content="text/html; charset=euc-jp">
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta http-equiv="Content-Language" content="ja-JP">
<meta name="robots" content="none">
<title>tDiary Grep</title>
@@ -115,12 +116,12 @@
begin
Uconv.u8toeuc(str)
rescue Uconv::Error
- NKF::nkf('-e -m0', str)
+ NKF::nkf('-w -m0', str)
end
end
rescue LoadError
def to_euc(str)
- NKF::nkf('-e -m0', str)
+ NKF::nkf('-w -m0', str)
end
end
@@ -135,10 +136,10 @@
Z_SPACE = "\241\241" # zen-kaku space
-BEGIN { $defout.binmode }
+BEGIN { $stdout.binmode }
def main
- $KCODE = 'EUC'
+# $KCODE = 'UTF8'
cgi = CGI.new
html = '<html><head><title></title></head><body><p>error</p></body></html>'
begin
@@ -154,11 +155,15 @@
begin
begin
if LOGGING and File.file?(query_log()) and cgi.valid?('history')
+ puts "history_page"
return history_page()
elsif not cgi.valid?('q')
+ puts "search_form_page()"
return search_form_page()
else
- query = to_euc([cgi.params['q']].compact.flatten.join(' '))
+ puts "else"
+# query = to_euc([cgi.params['q']].compact.flatten.join(' '))
+ query = [cgi.params['q']].compact.flatten.join(' ')
html = search_result_page(setup_patterns(query))
save_query(query, query_log()) if LOGGING
return html
@@ -183,7 +188,7 @@
def send_html(cgi, html)
print cgi.header('status' => '200 OK',
'type' => 'text/html',
- 'charset' => 'euc-jp',
+ 'charset' => 'UTF-8',
'Content-Length' => html.length.to_s,
'Cache-Control' => 'no-cache',
'Pragma' => 'no-cache')
@@ -191,9 +196,10 @@
end
def setup_patterns(query)
+ puts "setup_patterns"
patterns = split_string(query).map {|pat|
check_pattern pat
- /#{Regexp.quote(pat)}/ie
+ /#{Regexp.quote(pat)}/iu
}
raise WrongQuery, 'no pattern' if patterns.empty?
raise WrongQuery, 'too many sub patterns' if patterns.length > 8
@@ -201,6 +207,7 @@
end
def check_pattern(pat)
+ puts "check_pattern"
raise WrongQuery, 'no pattern' unless pat
raise WrongQuery, 'empty pattern' if pat.empty?
raise WrongQuery, "pattern too short: #{pat}" if pat.length < 2
@@ -208,7 +215,11 @@
end
def split_string(str)
- str.split(/[\s#{Z_SPACE}]+/oe).reject {|w| w.empty? }
+ puts "split_string"
+ puts NKF.guess(str)
+ puts str.encoding
+ puts __ENCODING__
+ str.split(/[\s#{Z_SPACE}]+/ou).reject {|w| w.empty? }
end
def save_query(query, file)
@@ -247,6 +258,7 @@
end
def search_result_page(patterns)
+ puts "search_result_page"
ERB.new(HEADER + SEARCH_RESULT + FOOTER).result(binding())
end
@@ -398,7 +410,7 @@
title, body = @source.split(/\n/, 2)
sprintf('%-30s | %s',
title.to_s.strip,
- remove_tags(body.to_s).gsub(/[\s#{Z_SPACE}]+/oe, ' ').slice(/\A.{0,60}/me))
+ remove_tags(body.to_s).gsub(/[\s#{Z_SPACE}]+/ou, ' ').slice(/\A.{0,60}/me))
end
private
コマンドラインから実行。
% cd ~/public_html/diary
% ./grep.rb
(offline mode: enter name=value pairs on standard input)
q=hoge[Ctrl+D]   <==== クエリ
else
setup_patterns
split_string     <==== split_string() まで来てる
US-ASCII         <==== NKF.guess(str)
UTF-8            <==== str.encoding
UTF-8            <==== __ENCODING__
Status: 200 OK
Content-Type: text/html; charset=UTF-8
Content-Length: 362
Cache-Control: no-cache
Pragma: no-cache

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html lang="ja-JP">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta http-equiv="Content-Language" content="ja-JP">
<meta name="robots" content="none">
<title>tDiary Grep</title>
</head>
<body>
<pre>
q=hoge invalid multibyte character   <==== 例外
</pre>
</body>
</html>
machu さんのところなどを読んだけれど、力尽きた。
[ツッコミを入れる]




