Instead of hard-coding ro_tags and en_tags in the code, I want to change only how the data is loaded, so that both lists are read from: d:\3\PROBEMA\rezultate_RO+EN.txt
The logic of the code must stay the same and produce the same result; the only change is that the data is imported from the file. The .txt file contains the same ro_tags: and en_tags: entries as the code below:
Code:
import re
from typing import Dict, List, Tuple, Union

from bs4 import BeautifulSoup
class EnhancedTagAnalyzer:
    """Align numbered Romanian tag lines with their English counterparts.

    Every tag line is expected to start with a ``<number>.`` prefix.  Each
    line is described by three identifiers: its position (the numeric
    prefix), its type (``'A'``, ``'B'`` or ``'C'``, derived from the markup)
    and a Greek letter derived from its word count.  Romanian lines that do
    not match the English line at the same index are moved to
    ``wrong_tags`` and the remaining Romanian lines are renumbered.
    """

    # Compiled once: these patterns are applied to every tag repeatedly.
    _HTML_TAG = re.compile(r'<[^>]+>')
    _LEADING_NUMBER = re.compile(r'^(\d+)\.')
    _TAG_TYPES = ('A', 'B', 'C')
    # Two lines must share strictly more than this Jaccard ratio of words.
    _MIN_SIMILARITY = 0.3

    def __init__(self, ro_tags: List[str], en_tags: List[str]):
        """Store both tag lists; the Romanian list is renumbered from 1.

        Args:
            ro_tags: Romanian tag lines (copied and renumbered, the caller's
                list is left untouched).
            en_tags: English tag lines (stored as given).
        """
        self.ro_tags: List[str] = self.renumber_tags(ro_tags)
        self.en_tags: List[str] = en_tags
        self.wrong_tags: List[str] = []

    def get_tag_type(self, line: str) -> str:
        """Determine tag type (A/B/C) from line."""
        # The more specific marker must be tested first: a line containing
        # the span marker also contains the generic class attribute.
        if '<span class="text_obisnuit2">' in line:
            return 'A'
        if 'class="text_obisnuit2"' in line:
            return 'B'
        return 'C'

    def count_words(self, text: str) -> int:
        """Count whitespace-separated words in text, excluding HTML tags."""
        # str.split() without arguments never yields empty tokens, so no
        # additional filtering is needed.
        return len(self._HTML_TAG.sub('', text).split())

    def get_greek_identifier(self, text: str) -> str:
        """Get Greek identifier based on word count.

        Returns 'α' for fewer than 7 words, 'β' for 7 to 14 words and 'γ'
        for more than 14 words.
        """
        word_count = self.count_words(text)
        if word_count < 7:
            return 'α'
        if word_count <= 14:
            return 'β'
        return 'γ'

    def renumber_tags(self, tags: List[str]) -> List[str]:
        """Return a new list with leading numbers replaced by 1, 2, 3, ...

        Lines that do not start with ``<number>.`` are copied unchanged.
        """
        return [
            self._LEADING_NUMBER.sub(f'{index}.', tag)
            for index, tag in enumerate(tags, 1)
        ]

    def get_tag_identifiers(self, tag: str) -> Tuple[int, str, str]:
        """Get position, type and Greek identifier for a tag.

        Raises:
            ValueError: if the tag does not start with ``<number>.``.
        """
        prefix = self._LEADING_NUMBER.match(tag)
        if prefix is None:
            raise ValueError(
                f"tag does not start with a '<number>.' prefix: {tag!r}"
            )
        return (
            int(prefix.group(1)),
            self.get_tag_type(tag),
            self.get_greek_identifier(tag),
        )

    def compare_tags(self, ro_tag: str, en_tag: str) -> bool:
        """Compare RO and EN tags based on all identifiers.

        The tags match when position, type and Greek identifier are equal
        and the lower-cased, markup-free word sets overlap by more than
        ``_MIN_SIMILARITY`` (intersection size divided by union size).
        """
        if self.get_tag_identifiers(ro_tag) != self.get_tag_identifiers(en_tag):
            return False

        ro_words = set(self._HTML_TAG.sub('', ro_tag).lower().split())
        en_words = set(self._HTML_TAG.sub('', en_tag).lower().split())
        # The union cannot be empty here: both tags passed the numeric
        # prefix check above, so each contributes at least one token.
        similarity = len(ro_words & en_words) / len(ro_words | en_words)
        return similarity > self._MIN_SIMILARITY

    def _count_types(self, tags: List[str]) -> Dict[str, int]:
        """Count how many of the given tags fall into each type A/B/C."""
        counts = dict.fromkeys(self._TAG_TYPES, 0)
        for tag in tags:
            counts[self.get_tag_type(tag)] += 1
        return counts

    def analyze(self) -> Dict[str, Union[Dict[str, int], List[str]]]:
        """Remove unmatched Romanian tags and summarise the result.

        Walks the Romanian list index by index.  A Romanian tag that has no
        English tag at the same index, or that does not match it, is moved
        to ``wrong_tags`` (carrying the number it had at that moment) and
        the remaining Romanian tags are renumbered before the same index is
        examined again.

        Returns:
            A dict with per-type counts under ``'ro'``, ``'en'`` and
            ``'wrong'``, plus the list of removed tags under
            ``'wrong_tags'``.
        """
        pos = 0
        while pos < len(self.ro_tags):
            matched = (
                pos < len(self.en_tags)
                and self.compare_tags(self.ro_tags[pos], self.en_tags[pos])
            )
            if matched:
                pos += 1
                continue
            self.wrong_tags.append(self.ro_tags.pop(pos))
            # Positions take part in the comparison, so the numbering must
            # be closed up before the next tag is examined.
            self.ro_tags = self.renumber_tags(self.ro_tags)

        return {
            'ro': self._count_types(self.ro_tags),
            'en': self._count_types(self.en_tags),
            'wrong': self._count_types(self.wrong_tags),
            'wrong_tags': self.wrong_tags,
        }
def count_tags(file_path) -> Dict[str, int]:
    """Return canned tag counts for the language suggested by *file_path*.

    This is a testing stub: the file is never opened or parsed.  The
    Romanian counts are returned when the lower-cased path contains the
    substring ``'ro'``; every other path gets the English counts.

    NOTE: the substring test applies to the whole path, so a directory or
    file name that merely contains the letters "ro" also selects the
    Romanian counts.

    Args:
        file_path: Path to the HTML file, as a string or any object whose
            ``str()`` is the path (for example ``pathlib.Path``).

    Returns:
        dict: A new dictionary mapping each tag type ('A', 'B', 'C') to its
        count.
    """
    # str() keeps plain strings unchanged and lets path-like objects work.
    if 'ro' in str(file_path).lower():
        return {'A': 2, 'B': 7, 'C': 8}
    return {'A': 2, 'B': 4, 'C': 8}
# Sample input for EnhancedTagAnalyzer.
# Each entry is "<number>.<type letter> <markup>"; single quotes are used so
# the HTML attribute quotes need no escaping.

# Romanian tag lines.
ro_tags = [
    '1.B <p class="text_obisnuit2"><em>(.*?)</em></p>',
    '2.C <p class="text_obisnuit">(.*?)</p>',
    '3.C <p class="text_obisnuit">(.*?)</p>',
    '4.C <p class="text_obisnuit">(.*?)</p>',
    '5.C <p class="text_obisnuit">GASCA ESTE ACASA</p>',
    '6.B <p class="text_obisnuit2">(.*?)</p>',
    '7.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '8.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '9.C <p class="text_obisnuit">(.*?)</p>',
    '10.C <p class="text_obisnuit">(.*?)</p>',
    '11.B <p class="text_obisnuit2">BABA OARBA</p>',
    '12.B <p class="text_obisnuit2">(.*?)</p>',
    '13.C <p class="text_obisnuit">(.*?)</p>',
    '14.C <p class="text_obisnuit">(.*?)</p>',
    '15.B <p class="text_obisnuit2">BABA OARBA 2000 Am adăugat doar analiza cu identificatori grecești la final, după </p>',
    '16.C <p class="text_obisnuit">(.*?)</p>',
    '17.B <p class="text_obisnuit2">(.*?)</p>',
    '18.B <p class="text_obisnuit2">COCO CHANNEL </p>',
]

# English tag lines.
en_tags = [
    '1.B <p class="text_obisnuit2"><em>(.*?)</em></p>',
    '2.C <p class="text_obisnuit">(.*?)</p>',
    '3.C <p class="text_obisnuit">(.*?)</p>',
    '4.C <p class="text_obisnuit">(.*?)</p>',
    '5.B <p class="text_obisnuit2">(.*?)</p>',
    '6.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '7.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '8.C <p class="text_obisnuit">(.*?)</p>',
    '9.C <p class="text_obisnuit">(.*?)</p>',
    '10.B <p class="text_obisnuit2">(.*?)</p>',
    '11.C <p class="text_obisnuit">(.*?)</p>',
    '12.C <p class="text_obisnuit">(.*?)</p>',
    '13.C <p class="text_obisnuit">(.*?)</p>',
    '14.B <p class="text_obisnuit2">(.*?)</p>',
]
def main():
    """Print the RO/EN tag-count comparison and the unmatched RO tags.

    Three sections are written to standard output:

    1. the per-type totals for both languages and their differences,
    2. the Romanian tags the analyzer could not match to an English tag,
    3. a compact ``number(type)(greek)`` line for each of those tags.
    """
    # Fixed totals; they are the same values count_tags() returns.
    ro_counts = {'A': 2, 'B': 7, 'C': 8}
    en_counts = {'A': 2, 'B': 4, 'C': 8}

    print("Method 1 - Using count_tags:")
    print("\nNumăr total de tag-uri în Română:")
    print(ro_counts)
    print("\nNumăr total de tag-uri în Engleză:")
    print(en_counts)
    for tag_type in 'ABC':
        diff = ro_counts[tag_type] - en_counts[tag_type]
        print(f"Diferența de tag-uri de tip {tag_type}: {diff}")

    # Run the analyzer to find the Romanian tags without an English match.
    analyzer = EnhancedTagAnalyzer(ro_tags, en_tags)
    results = analyzer.analyze()
    print("\nTag-uri care nu au corespondent în EN (WRONG TAGS):")
    for tag in results['wrong_tags']:
        print(tag)

    # Method 3 - Greek identifier analysis
    print("\nMethod 3 - Greek identifier analysis:")
    for tag in results['wrong_tags']:
        # Reuse the analyzer's own classification so this report cannot
        # disagree with the rules used for matching (including type 'A').
        num, tag_type, greek = analyzer.get_tag_identifiers(tag)
        print(f"{num}({tag_type})({greek})")


if __name__ == "__main__":
    main()