<?xml version='1.0' encoding='UTF-8'?>
<!-- OAI-PMH GetRecord response carrying one OLAC 1.1 metadata record for the SynEst English-to-Estonian corpus. -->
<OAI-PMH xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.openarchives.org/OAI/2.0/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
  <responseDate>2026-04-12T12:49:26Z</responseDate>
  <!-- Echo of the request that produced this response; the identifier matches the record header below. -->
  <request verb="GetRecord" metadataPrefix="olac" identifier="2c7256ade14b11ee8822cf5d819ab78b9cb61168200644faa3b9aae58f95c3c0">https://metashare.ut.ee/oai_pmh/</request>
  <GetRecord>
    <record>
      <!-- Record header: identifier, datestamp and the sets this record is listed under. -->
      <header>
        <identifier>2c7256ade14b11ee8822cf5d819ab78b9cb61168200644faa3b9aae58f95c3c0</identifier>
        <datestamp>2024-03-15T16:16:31Z</datestamp>
        <setSpec>corpus</setSpec>
        <setSpec>corpus:text</setSpec>
      </header>
      <metadata>
        <!-- OLAC payload: Dublin Core elements plus dcterms refinements and OLAC-coded attributes. -->
        <olac:olac xmlns:olac="http://www.language-archives.org/OLAC/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://www.language-archives.org/OLAC/1.1/dc.xsd http://purl.org/dc/terms/ http://www.language-archives.org/OLAC/1.1/dcterms.xsd http://www.language-archives.org/OLAC/1.1/ http://www.language-archives.org/OLAC/1.1/olac.xsd">
          <dc:title xml:lang="en">SynEst (English-to-Estonian) Synthetic Estonian Parallel Corpus</dc:title>
          <!-- The description is multi-line text: its line breaks are part of the value, so it is not re-indented. -->
          <dc:description xml:lang="en">Synthetic parallel corpus with original English texts, machine-translated into Estonian and filtered.

Original English text sources:
- NewsCrawl (https://data.statmt.org/news-crawl) up to year 2021
- ParaCrawl v9 (https://paracrawl.eu): the English side of parallel corpora between English and German, Spanish, Finnish, French, Lithuanian, Latvian, Russian, Swedish, Ukrainian and Chinese
- United Nations Parallel Corpus (https://conferences.unite.un.org/uncorpus)
- OpenSubtitles (https://opus.nlpl.eu) monolingual English texts

Additional unfiltered data (not included in count):
- Reddit data (downloaded via https://github.com/microsoft/DialoGPT) in English</dc:description>
          <dcterms:alternative xml:lang="en">SynEst-en-to-et</dcterms:alternative>
          <dc:identifier xsi:type="dcterms:URI">https://doi.org/10.15155/5R1E-6R35</dc:identifier>
          <dc:language xsi:type="olac:language" olac:code="et">Estonian</dc:language>
          <dc:language xsi:type="olac:language" olac:code="en">English</dc:language>
          <dc:type xsi:type="olac:linguistic-type" olac:code="primary_text"/>
          <dc:subject>language resources, bilingual corpus</dc:subject>
          <dc:type xsi:type="dcterms:DCMIType">Text</dc:type>
          <!-- Keep the license value on a single line: whitespace inside the element is part of its text value. -->
          <dcterms:license>CC-BY</dcterms:license>
          <dcterms:rightsHolder>IPR Holder: Tartu Ülikool</dcterms:rightsHolder>
          <dcterms:extent>768250602 sentences</dcterms:extent>
          <dcterms:medium>downloadable</dcterms:medium>
          <dc:contributor xsi:type="olac:role" olac:code="depositor">Mark Fišel, fishel[at]ut.ee, Tartu Ülikool</dc:contributor>
        </olac:olac>
      </metadata>
    </record>
  </GetRecord>
</OAI-PMH>
