Skip to content

Commit 646ac51

Browse files
committed
Add: translation normalization
1 parent 1846d73 commit 646ac51

15 files changed

Lines changed: 912 additions & 2 deletions

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# webgate.pro
22

33
[![CI](https://github.com/WebgateSystems/webgate.pro/actions/workflows/rubyonrails.yml/badge.svg)](https://github.com/WebgateSystems/webgate.pro/actions/workflows/rubyonrails.yml)
4-
![Coverage](https://img.shields.io/badge/coverage-95.1%25-brightgreen)
4+
![Coverage](https://img.shields.io/badge/coverage-95.2%25-brightgreen)
55
[![Ruby](https://img.shields.io/badge/Ruby-3.2.2-CC342D?logo=ruby&logoColor=white)](https://www.ruby-lang.org/)
66
[![Rails](https://img.shields.io/badge/Rails-7.0.10-D30001?logo=rubyonrails&logoColor=white)](https://rubyonrails.org/)
77
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)

app/helpers/application_helper.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ def compare_path(menu_item)
2323
end
2424

2525
# Builds the escaped, lower-cased path segment for a main-menu entry.
#
# The slug is taken from the item's name; when the name is blank the item's
# altlink is used instead, and an empty string when both are blank (or when
# +menu_item+ itself is nil).
#
# NOTE(review): URI.escape was removed from Ruby's standard library in 3.0 —
# presumably the application provides a compatibility shim; confirm.
def main_menu_path(menu_item)
  label = menu_item&.name.presence
  label ||= menu_item&.altlink.presence
  label ||= ''

  lowered = label.to_s.mb_chars.downcase.to_s
  URI.escape(lowered)
end
2830

2931
def menu_item_active?(menu_item)
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# frozen_string_literal: true
2+
3+
require 'json'
4+
require 'net/http'
5+
require 'openssl'
6+
require 'uri'
7+
8+
# Asks the OpenAI chat-completions API to repair a translated HTML fragment and
# returns the result with styling attributes removed.
#
# Every failure that originates from the API response (HTTP error status,
# malformed JSON, missing "html" key) is reported as GptTranslationRepairService::Error.
class GptTranslationRepairService
  class Error < StandardError; end

  ENDPOINT = 'https://api.openai.com/v1/chat/completions'
  MODEL = 'gpt-4o-mini'

  # @param api_key [String] bearer token sent to the API
  # @param verify_ssl [Boolean] verify the server certificate (default: true).
  #   The request carries the API key, so verification should only be turned off
  #   deliberately, e.g. on a host whose CA/CRL store is known to be broken.
  def initialize(api_key: Settings.gpt_key, verify_ssl: true)
    @api_key = api_key
    @verify_ssl = verify_ssl
  end

  # Returns repaired HTML (string) in target language, with styling removed.
  #
  # @param base_html [#to_s] source-of-truth HTML
  # @param base_locale [#to_s] locale of +base_html+
  # @param current_target_html [#to_s] existing (possibly wrong) translation
  # @param target_locale [#to_s] locale the result must be written in
  # @return [String] output of HtmlTranslationNormalizer for the model's HTML
  # @raise [Error] on missing API key, blank locale, HTTP failure or unusable response
  def call(base_html:, base_locale:, current_target_html:, target_locale:)
    ensure_api_key!

    target_locale = normalize_locale!(target_locale, label: 'target')
    base_locale = normalize_locale!(base_locale, label: 'base')

    base_html = base_html.to_s
    current_target_html = current_target_html.to_s

    prompt = build_prompt(base_html:, base_locale:, current_target_html:, target_locale:)
    parsed = parse_json(chat(prompt))

    HtmlTranslationNormalizer.call(extract_html(parsed))
  end

  private

  def ensure_api_key!
    raise Error, 'Missing Settings.gpt_key' if @api_key.blank?
  end

  # Coerces the locale to a String and rejects blank values.
  def normalize_locale!(locale, label:)
    value = locale.to_s
    raise Error, "Invalid #{label} locale: #{value}" if value.blank?

    value
  end

  def build_prompt(base_html:, base_locale:, current_target_html:, target_locale:)
    <<~PROMPT
      You are a professional translator and HTML cleaner.

      Task:
      - Target language: #{target_locale.upcase}
      - You will receive:
      (A) Source HTML in #{base_locale.upcase} (source of truth)
      (B) Current #{target_locale.upcase} HTML (may be wrong language, may contain inline styles/classes)
      - Output MUST be a JSON object with exactly one key: "html".
      - Value of "html" MUST be valid HTML and MUST contain ZERO inline styles and ZERO styling attributes:
      remove all style="", class="", id="" attributes.
      - Preserve structure and tags, but remove redundant wrapper spans if needed.
      - If (B) is already in the correct target language, keep its meaning but normalize/clean the HTML.
      - If (B) is NOT in the correct target language, translate from (A) to #{target_locale.upcase} and output cleaned HTML.
      - Do not output markdown. Do not include explanations.

      Source HTML (A):
      #{base_html}

      Current target HTML (B):
      #{current_target_html}
    PROMPT
  end

  # Sends the prompt and returns the assistant message content (a String).
  def chat(prompt)
    uri = URI.parse(ENDPOINT)
    response = build_http(uri).request(build_request(uri, prompt))
    parse_chat_response(response)
  rescue JSON::ParserError => e
    raise Error, "OpenAI JSON parse error: #{e.message}"
  end

  def build_http(uri)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    # Verify the peer unless the caller opted out: without verification the
    # Authorization header can be read by anyone able to intercept the connection.
    http.verify_mode = @verify_ssl ? OpenSSL::SSL::VERIFY_PEER : OpenSSL::SSL::VERIFY_NONE
    http
  end

  def build_request(uri, prompt)
    req = Net::HTTP::Post.new(uri.request_uri)
    req['Authorization'] = "Bearer #{@api_key}"
    req['Content-Type'] = 'application/json'
    req.body = JSON.dump(build_payload(prompt))
    req
  end

  def build_payload(prompt)
    {
      model: MODEL,
      temperature: 0.2,
      # Ask the API for a JSON object so the reply does not need to be scraped
      # out of surrounding prose; parse_json still tolerates wrapped output.
      response_format: { type: 'json_object' },
      messages: [
        { role: 'system', content: 'Return JSON only.' },
        { role: 'user', content: prompt }
      ]
    }
  end

  def parse_chat_response(res)
    raise Error, "OpenAI HTTP #{res.code}: #{res.body.to_s[0..300]}" unless res.is_a?(Net::HTTPSuccess)

    body = JSON.parse(res.body)
    content = body.dig('choices', 0, 'message', 'content')
    raise Error, "OpenAI response missing content: #{res.body.to_s[0..300]}" if content.blank?

    content
  end

  # Parses the model output, falling back to the outermost {...} substring when
  # the model wrapped the JSON in extra text.
  def parse_json(text)
    JSON.parse(text)
  rescue JSON::ParserError
    parse_embedded_json(text.to_s)
  end

  # Raises Error (never JSON::ParserError) when no parsable object is found.
  def parse_embedded_json(text)
    extracted = text[/\{[\s\S]*\}/]
    raise JSON::ParserError, 'no JSON object found' if extracted.nil?

    JSON.parse(extracted)
  rescue JSON::ParserError
    raise Error, "Response is not valid JSON: #{text[0..300]}"
  end

  # Returns the value stored under "html", or raises Error when the parsed
  # response is not an object carrying that key.
  def extract_html(parsed)
    return parsed['html'] if parsed.is_a?(Hash) && parsed.key?('html')

    raise Error, %(Response JSON is missing the "html" key: #{parsed.inspect[0..300]})
  end
end
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# frozen_string_literal: true
2+
3+
class HtmlTranslationNormalizer
  # Removes inline styling and "styling attributes" while keeping semantic HTML.
  #
  # - Strips: style, class, id (only inside tags, never from text content)
  # - Unwraps <span> nodes that are attribute-less wrappers, together with
  #   their own closing tag; spans that keep attributes stay intact.
  #
  # This is intentionally conservative: it does not remove tags like <strong>, <em>, <a>, etc.

  # Attribute names treated as presentation-only.
  STYLING_ATTRIBUTES = %w[style class id].freeze

  # An opening tag; quoted attribute values may contain ">" or newlines.
  OPENING_TAG = /<[a-z][^\s>\/]*(?:[^>"']|"[^"]*"|'[^']*')*>/i

  # One attribute inside a tag: leading whitespace, name (group 1), optional value.
  ATTRIBUTE = /\s+([^\s=<>"'\/]+)(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'<>=`]+))?/

  # An opening or closing span tag: group 1 is "/" for a closer, group 2 the raw attribute text.
  SPAN_TAG = %r{<(/?)span(?=[\s/>])((?:[^>"']|"[^"]*"|'[^']*')*)>}i

  # @param html [#to_s, nil] HTML fragment
  # @return [String] cleaned fragment; '' when the input is nil or whitespace only
  def self.call(html)
    source = html.to_s
    return '' if source.match?(/\A[[:space:]]*\z/)

    unwrap_plain_spans(strip_styling_attributes(source))
  end

  # Drops style/class/id attributes from every opening tag. Working tag by tag
  # (and attribute by attribute) keeps look-alike text such as `id="x"` in the
  # content, or inside another attribute's value, untouched.
  def self.strip_styling_attributes(html)
    html.gsub(OPENING_TAG) do |tag|
      tag.gsub(ATTRIBUTE) do
        attribute = Regexp.last_match(0)
        STYLING_ATTRIBUTES.include?(Regexp.last_match(1).downcase) ? '' : attribute
      end
    end
  end

  # Removes attribute-less <span> tags and exactly the closing tags that belong
  # to them, so spans that still carry attributes remain balanced.
  def self.unwrap_plain_spans(html)
    # One entry per currently open span: true when its opening tag was removed.
    removed_openers = []

    html.gsub(SPAN_TAG) do
      tag = Regexp.last_match(0)

      if Regexp.last_match(1).empty?
        plain = Regexp.last_match(2).strip.empty?
        removed_openers.push(plain)
        plain ? '' : tag
      else
        # A closer without a recorded opener is malformed markup; drop it.
        (removed_openers.empty? || removed_openers.pop) ? '' : tag
      end
    end
  end

  private_class_method :strip_styling_attributes, :unwrap_plain_spans
end
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# frozen_string_literal: true
2+
3+
class TextTranslationNormalizer
  # Cleans common artifacts like:
  # - quotes wrapping the whole text (", ', “ ”, „ ”)
  # - stray quotes left dangling at the very end
  # - excessive whitespace/newlines
  #
  # It is conservative: a quote is only removed when it cannot be part of the
  # text itself. `He said "hello"` and `"Hello" and "World"` are left untouched.

  QUOTE_PAIRS = [
    ['"', '"'],
    ["'", "'"],
    ['“', '”'],
    ['„', '”']
  ].freeze

  # Trailing run of quote characters: group 1 is the whitespace separating it
  # from the text, group 2 the quotes themselves.
  TRAILING_QUOTES = /(\s*)(["'“”„]+)\s*\z/

  # A straight single quote between two letters/digits, i.e. an apostrophe.
  APOSTROPHE = /(?<=[[:alnum:]])'(?=[[:alnum:]])/

  # A straight single quote that can open a quotation (not preceded by a letter/digit).
  OPENING_SINGLE_QUOTE = /(?<![[:alnum:]])'/

  # @param text [#to_s, nil]
  # @return [String] normalized text; '' for nil
  def self.call(text)
    return '' if text.nil?

    str = text.to_s.tr("\u00A0", ' ').strip # nbsp -> space, so strip can see it
    str = str.gsub(/\r\n?/, "\n").gsub(/\n{3,}/, "\n\n")

    str = remove_wrapping_quotes(str)
    str = remove_dangling_trailing_quotes(str)
    str.strip
  end

  # Strips a quote pair only when it encloses the whole text, i.e. the same
  # quote characters do not occur again in between.
  def self.remove_wrapping_quotes(str)
    QUOTE_PAIRS.each do |open_q, close_q|
      next unless str.length >= 2 && str.start_with?(open_q) && str.end_with?(close_q)

      inner = str[1..-2]
      next if quoted_inside?(inner, open_q, close_q)

      str = inner.strip
    end

    str
  end

  # True when +inner+ still contains one of the pair's quote characters.
  # Apostrophes inside words do not count for the straight single quote.
  def self.quoted_inside?(inner, open_q, close_q)
    probe = open_q == "'" ? inner.gsub(APOSTROPHE, '') : inner
    probe.include?(open_q) || probe.include?(close_q)
  end

  # Removes trailing quote characters that are obvious artifacts: quotes
  # separated from the text by whitespace, or quotes with no opening
  # counterpart earlier in the text.
  def self.remove_dangling_trailing_quotes(str)
    match = TRAILING_QUOTES.match(str)
    return str unless match

    body = match.pre_match
    detached = body.empty? || !match[1].empty?
    return body if detached || !closes_open_quote?(body, match[2])

    str
  end

  # True when every quote in the trailing run +quotes+ plausibly closes a
  # quotation opened in +body+.
  def self.closes_open_quote?(body, quotes)
    quotes.each_char.all? do |quote|
      case quote
      when '"' then (body.count('"') + quotes.count('"')).even?
      when "'" then body.match?(OPENING_SINGLE_QUOTE)
      when '”', '“' then body.match?(/[“„]/)
      else false # an opening-only quote („) at the very end is never legitimate
      end
    end
  end

  private_class_method :remove_wrapping_quotes, :quoted_inside?,
                       :remove_dangling_trailing_quotes, :closes_open_quote?
end
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# rubocop:disable Metrics/BlockLength
2+
namespace :technologies do
  desc 'Clean technology translations (strip quotes, remove styles/classes/ids). ' \
       'Usage: rake technologies:cleanup_translations[technology_id,locale,dry_run]'
  # Normalizes the description and link of technologies, per locale.
  #
  # Arguments (all optional):
  # - technology_id: restrict the run to one record; otherwise all technologies
  # - locale:        restrict the run to one locale; otherwise I18n.available_locales
  # - dry_run:       the literal string "true" only reports what would change
  #
  # A failure in one locale is counted and printed; the run then continues.
  task :cleanup_translations, %i[technology_id locale dry_run] => :environment do |_t, args|
    technology_id = args[:technology_id].presence
    requested_locale = args[:locale].presence&.to_sym
    dry_run = args[:dry_run].to_s == 'true'

    locales = requested_locale ? [requested_locale] : I18n.available_locales
    technologies = technology_id ? Technology.where(id: technology_id) : Technology.all
    total = technologies.count

    puts 'Starting cleanup of technology translations...'
    puts "Dry run: #{dry_run}"
    puts "Locales: #{locales.join(', ')}"
    puts ''

    processed = 0
    changed = 0
    errors = 0

    technologies.find_each do |tech|
      processed += 1
      puts "[#{processed}/#{total}] Technology ID: #{tech.id} (#{tech.title})"

      locales.each do |locale|
        I18n.with_locale(locale) do
          # NOTE(review): reads go through the model's locale-aware accessors;
          # whether they fall back to another locale is not visible here — confirm.
          original_desc = tech.description.to_s
          original_link = tech.link.to_s

          cleaned_desc = TextTranslationNormalizer.call(HtmlTranslationNormalizer.call(original_desc))
          # Never replace an existing link with an empty one.
          cleaned_link = TextTranslationNormalizer.call(original_link).presence || original_link

          next if cleaned_desc == original_desc && cleaned_link == original_link

          unless dry_run
            translation = tech.translations.find_or_initialize_by(locale: locale.to_s)
            translation.description = cleaned_desc
            translation.link = cleaned_link
            translation.save!
          end

          # Counted only after a successful save (or immediately in a dry run).
          changed += 1
          puts(dry_run ? " - #{locale}: WOULD CLEAN" : " - #{locale}: cleaned")
        end
      rescue StandardError => e
        errors += 1
        puts " - #{locale}: ERROR #{e.class}: #{e.message}"
      end
    end

    puts "\nDone. Processed: #{processed}, changed: #{changed}, errors: #{errors}"
  end
end
66+
# rubocop:enable Metrics/BlockLength

0 commit comments

Comments
 (0)