commit ed58c540a2d55f34b5675942d91f49636e8e5199 Author: Alexander Yakovlev Date: Fri Nov 4 13:02:48 2016 +0700 Начало проекта diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..46b4d52 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,14 @@ +# This file is for unifying the coding style for different editors and IDEs +# editorconfig.org +root = true + +[*] +end_of_line = lf +charset = utf-8 +indent_style = space +indent_size = 2 +insert_final_newline = true +trim_trailing_whitespace = true + +[*.md] +trim_trailing_whitespace = false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba79636 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +#*# +*~ +.DS_Store diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..8769bdd --- /dev/null +++ b/.travis.yml @@ -0,0 +1,4 @@ +language: ruby +sudo: false + +script: "bundle exec rake test" diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..8da838f --- /dev/null +++ b/Gemfile @@ -0,0 +1,2 @@ +source 'https://rubygems.org' +gem 'yaml' diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bdfff7a --- /dev/null +++ b/LICENSE @@ -0,0 +1,118 @@ + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT + PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT + CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES + THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO + WARRANTIES REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION + OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES + RESULTING FROM THE USE OF THIS DOCUMENT OR THE INFORMATION OR + WORKS PROVIDED HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically +confer exclusive Copyright and Related Rights (defined below) upon the +creator and subsequent owner(s) (each and all, an "owner") of an +original work of authorship and/or a database (each, a "Work"). 
+ +Certain owners wish to permanently relinquish those rights to a Work +for the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without +fear of later claims of infringement build upon, modify, incorporate +in other works, reuse and redistribute as freely as possible in any +form whatsoever and for any purposes, including without limitation +commercial purposes. These owners may contribute to the Commons to +promote the ideal of a free culture and the further production of +creative, cultural and scientific works, or to gain reputation or +greater distribution for their Work in part through the use and +efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or +she is an owner of Copyright and Related Rights in the Work, +voluntarily elects to apply CC0 to the Work and publicly distribute +the Work under its terms, with knowledge of his or her Copyright and +Related Rights in the Work and the meaning and intended legal effect +of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may +be protected by copyright and related or neighboring rights +("Copyright and Related Rights"). 
Copyright and Related Rights +include, but are not limited to, the following: + + the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; moral rights retained by the + original author(s) and/or performer(s); publicity and privacy + rights pertaining to a person's image or likeness depicted in a + Work; rights protecting against unfair competition in regards to a + Work, subject to the limitations in paragraph 4(a), below; rights + protecting the extraction, dissemination, use and reuse of data in + a Work; database rights (such as those arising under Directive + 96/9/EC of the European Parliament and of the Council of 11 March + 1996 on the legal protection of databases, and under any national + implementation thereof, including any amended or successor version + of such directive); and other similar, equivalent or corresponding + rights throughout the world based on applicable law or treaty, and + any national implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in +contravention of, applicable law, Affirmer hereby overtly, fully, +permanently, irrevocably and unconditionally waives, abandons, and +surrenders all of Affirmer's Copyright and Related Rights and +associated claims and causes of action, whether now known or unknown +(including existing as well as future claims and causes of action), in +the Work (i) in all territories worldwide, (ii) for the maximum +duration provided by applicable law or treaty (including future time +extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"Waiver"). 
Affirmer makes the Waiver for the benefit of each member of +the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal +or equitable action to disrupt the quiet enjoyment of the Work by the +public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any +reason be judged legally invalid or ineffective under applicable law, +then the Waiver shall be preserved to the maximum extent permitted +taking into account Affirmer's express Statement of Purpose. In +addition, to the extent the Waiver is so judged Affirmer hereby grants +to each affected person a royalty-free, non transferable, non +sublicensable, non exclusive, irrevocable and unconditional license to +exercise Affirmer's Copyright and Related Rights in the Work (i) in +all territories worldwide, (ii) for the maximum duration provided by +applicable law or treaty (including future time extensions), (iii) in +any current or future medium and for any number of copies, and (iv) +for any purpose whatsoever, including without limitation commercial, +advertising or promotional purposes (the "License"). The License shall +be deemed effective as of the date CC0 was applied by Affirmer to the +Work. Should any part of the License for any reason be judged legally +invalid or ineffective under applicable law, such partial invalidity +or ineffectiveness shall not invalidate the remainder of the License, +and in such case Affirmer hereby affirms that he or she will not (i) +exercise any of his or her remaining Copyright and Related Rights in +the Work or (ii) assert any associated claims and causes of action +with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. 
+ + No trademark or patent rights held by Affirmer are waived, + abandoned, surrendered, licensed or otherwise affected by this + document. Affirmer offers the Work as-is and makes no + representations or warranties of any kind concerning the Work, + express, implied, statutory or otherwise, including without + limitation warranties of title, merchantability, fitness for a + particular purpose, non infringement, or the absence of latent or + other defects, accuracy, or the present or absence of errors, + whether or not discoverable, all to the greatest extent + permissible under applicable law. Affirmer disclaims + responsibility for clearing rights of other persons that may apply + to the Work or any use thereof, including without limitation any + person's Copyright and Related Rights in the Work. Further, + Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. Affirmer understands and acknowledges that Creative Commons + is not a party to this document and has no duty or obligation with + respect to this CC0 or use of the Work. diff --git a/README.md b/README.md new file mode 100644 index 0000000..3ffac7d --- /dev/null +++ b/README.md @@ -0,0 +1,37 @@ +#Rus_Corpora + +[![Проверка кода](https://travis-ci.org/Oreolek/rus_corpora.svg?branch=master)](https://travis-ci.org/Oreolek/rus_corpora) + +Это - коллекция корпусов текста, которые могут быть полезными для создания ботов, игр, каких-нибудь ещё странных вещей. + +Эти подборки могут помочь вам быстро сделать прототип. У этого проекта нет цели собрать лингвистически или статистически значимые корпусы. После того, как вы собрали *что-то* на этих текстах, вы можете просто расширить ваш корпус данными из других источников. + +##Лицензия + +Все данные в этом проекте - это общественное достояние. 
Если вам всё-таки нужна лицензия, то это Creative Commons Zero: + +[CC0 1.0 Universal](http://creativecommons.org/publicdomain/zero/1.0/) + +Оригинальная идея собрать коллекцию корпусов — [Darius Kazemi](https://github.com/dariusk/corpora). + +##Чем не является этот проект + +* Он не заменяет полноценные API — у вас есть [Национальный корпус русского языка](http://ruscorpora.ru/) и API движка [MediaWiki](http://www.mediawiki.org/wiki/API:Main_page). + +* Он не заменяет полноценные лингвистические корпусы — здесь нет особых разметок. + +* Он содержит слова только русского языка. + +* У вас нет надёжных гарантий, что в текстах нет опечаток или повторений. Нам тоже важно следить за ошибками, но мы не отвечаем, если что-то пропустили. Не используйте этот проект для важных разработок. Если вы заметили ошибку, исправьте её сами в новом pull request или хотя бы сообщите нам через Задачи (Issues). + +##Чем является этот проект + +* Rus_Corpora - это сборник файлов [YAML](http://yaml.org/), который не зависит от конкретного языка программирования. Если вы хотите оформить его в модуль Ruby, NPM, composer, luarocks или pip - делайте отдельный проект. +* Rus_Corpora - это коллекция *маленьких* файлов. Каждый файл не должен описывать больше тысячи вещей. + +##Как внести вклад в коллекцию + +* КОГДА ВЫ ОТПРАВЛЯЕТЕ ДАННЫЕ В ЭТОТ ПРОЕКТ, ВЫ СОГЛАШАЕТЕСЬ С ИХ ПУБЛИКАЦИЕЙ ПОД ЛИЦЕНЗИЕЙ [CC0](http://creativecommons.org/publicdomain/zero/1.0/), ЧТО ОСВОБОЖДАЕТ ИХ НАВСЕГДА ДЛЯ ИСПОЛЬЗОВАНИЯ КЕМ УГОДНО ПО ЛЮБОЙ ПРИЧИНЕ БЕЗ УКАЗАНИЯ АВТОРСТВА. +* Пожалуйста, отправляйте данные в формате [YAML](https://www.opennet.ru/base/dev/yaml.txt.html) с расширением `.yml`. К репозиторию подключено тестирование Travis-CI, поэтому вы можете сразу увидеть, если ваш pull request содержит невалидный YAML или файл с повторениями. +* Не переусердствуйте. Держите в каждом файле не больше тысячи объектов. Объект может быть составным, так что это не означает тысячу строк, но где-то в районе пятитысячной строки можно начинать волноваться. +* У файлов нет минимального размера. 
Если вы считаете, что коллекция определённых слов или фраз точно понадобится другим людям, добавляйте её, даже если вы собрали всего два десятка фраз. Возможно, другие люди её расширят. diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..b85d4d6 --- /dev/null +++ b/Rakefile @@ -0,0 +1,5 @@ +task default: %w[test] + +task :test do + ruby "test.rb" +end diff --git a/data/наука/элементы.yml b/data/наука/элементы.yml new file mode 100644 index 0000000..5978ebb --- /dev/null +++ b/data/наука/элементы.yml @@ -0,0 +1,355 @@ +--- +- number: 1 + name: "Водород" + symbol: H +- number: 2 + name: "Гелий" + symbol: He +- number: 3 + name: "Литий" + symbol: Li +- number: 4 + name: "Бериллий" + symbol: Be +- number: 5 + name: "Бор" + symbol: B +- number: 6 + name: "Углерод" + symbol: C +- number: 7 + name: "Азот" + symbol: N +- number: 8 + name: "Кислород" + symbol: O +- number: 9 + name: "Фтор" + symbol: F +- number: 10 + name: "Неон" + symbol: Ne +- number: 11 + name: "Натрий" + symbol: Na +- number: 12 + name: "Магний" + symbol: Mg +- number: 13 + name: "Алюминий" + symbol: Al +- number: 14 + name: "Кремний" + symbol: Si +- number: 15 + name: "Фосфор" + symbol: P +- number: 16 + name: "Сера" + symbol: S +- number: 17 + name: "Хлор" + symbol: Cl +- number: 18 + name: "Аргон" + symbol: Ar +- number: 19 + name: "Калий" + symbol: K +- number: 20 + name: "Кальций" + symbol: Ca +- number: 21 + name: "Скандий" + symbol: Sc +- number: 22 + name: "Титан" + symbol: Ti +- number: 23 + name: "Ванадий" + symbol: V +- number: 24 + name: "Хром" + symbol: Cr +- number: 25 + name: "Марганец" + symbol: Mn +- number: 26 + name: "Железо" + symbol: Fe +- number: 27 + name: "Кобальт" + symbol: Co +- number: 28 + name: "Никель" + symbol: Ni +- number: 29 + name: "Медь" + symbol: Cu +- number: 30 + name: "Цинк" + symbol: Zn +- number: 31 + name: "Галлий" + symbol: Ga +- number: 32 + name: "Германий" + symbol: Ge +- number: 33 + name: "Мышьяк" + symbol: As +- number: 34 + name: 
"Селен" + symbol: Se +- number: 35 + name: "Бром" + symbol: Br +- number: 36 + name: "Криптон" + symbol: Kr +- number: 37 + name: "Рубидий" + symbol: Rb +- number: 38 + name: "Стронций" + symbol: Sr +- number: 39 + name: "Иттрий" + symbol: Y +- number: 40 + name: "Цирконий" + symbol: Zr +- number: 41 + name: "Ниобий" + symbol: Nb +- number: 42 + name: "Молибден" + symbol: Mo +- number: 43 + name: "Технеций" + symbol: Tc +- number: 44 + name: "Рутений" + symbol: Ru +- number: 45 + name: "Родий" + symbol: Rh +- number: 46 + name: "Палладий" + symbol: Pd +- number: 47 + name: "Серебро" + symbol: Ag +- number: 48 + name: "Кадмий" + symbol: Cd +- number: 49 + name: "Индий" + symbol: In +- number: 50 + name: "Олово" + symbol: Sn +- number: 51 + name: "Сурьма" + symbol: Sb +- number: 52 + name: "Теллур" + symbol: Te +- number: 53 + name: "Иод" + symbol: I +- number: 54 + name: "Ксенон" + symbol: Xe +- number: 55 + name: "Цезий" + symbol: Cs +- number: 56 + name: "Барий" + symbol: Ba +- number: 57 + name: "Лантан" + symbol: La +- number: 58 + name: "Церий" + symbol: Ce +- number: 59 + name: "Празеодим" + symbol: Pr +- number: 60 + name: "Неодим" + symbol: Nd +- number: 61 + name: "Прометий" + symbol: Pm +- number: 62 + name: "Самарий" + symbol: Sm +- number: 63 + name: "Европий" + symbol: Eu +- number: 64 + name: "Гадолиний" + symbol: Gd +- number: 65 + name: "Тербий" + symbol: Tb +- number: 66 + name: "Диспрозий" + symbol: Dy +- number: 67 + name: "Гольмий" + symbol: Ho +- number: 68 + name: "Эрбий" + symbol: Er +- number: 69 + name: "Тулий" + symbol: Tm +- number: 70 + name: "Иттербий" + symbol: Yb +- number: 71 + name: "Лютеций" + symbol: Lu +- number: 72 + name: "Гафний" + symbol: Hf +- number: 73 + name: "Тантал" + symbol: Ta +- number: 74 + name: "Вольфрам" + symbol: W +- number: 75 + name: "Рений" + symbol: Re +- number: 76 + name: "Осмий" + symbol: Os +- number: 77 + name: "Иридий" + symbol: Ir +- number: 78 + name: "Платина" + symbol: Pt +- number: 79 + name: 
"Золото" + symbol: Au +- number: 80 + name: "Ртуть" + symbol: Hg +- number: 81 + name: "Таллий" + symbol: Tl +- number: 82 + name: "Свинец" + symbol: Pb +- number: 83 + name: "Висмут" + symbol: Bi +- number: 84 + name: "Полоний" + symbol: Po +- number: 85 + name: "Астат" + symbol: At +- number: 86 + name: "Радон" + symbol: Rn +- number: 87 + name: "Франций" + symbol: Fr +- number: 88 + name: "Радий" + symbol: Ra +- number: 89 + name: "Актиний" + symbol: Ac +- number: 90 + name: "Торий" + symbol: Th +- number: 91 + name: "Протактиний" + symbol: Pa +- number: 92 + name: "Уран" + symbol: U +- number: 93 + name: "Нептуний" + symbol: Np +- number: 94 + name: "Плутоний" + symbol: Pu +- number: 95 + name: "Америций" + symbol: Am +- number: 96 + name: "Кюрий" + symbol: Cm +- number: 97 + name: "Берклий" + symbol: Bk +- number: 98 + name: "Калифорний" + symbol: Cf +- number: 99 + name: "Эйнштейний" + symbol: Es +- number: 100 + name: "Фермий" + symbol: Fm +- number: 101 + name: "Менделевий" + symbol: Md +- number: 102 + name: "Нобелий" + symbol: "No" +- number: 103 + name: "Лоуренсий" + symbol: Lr +- number: 104 + name: "Резерфордий" + symbol: Rf +- number: 105 + name: "Дубний" + symbol: Db +- number: 106 + name: "Сиборгий" + symbol: Sg +- number: 107 + name: "Борий" + symbol: Bh +- number: 108 + name: "Хассий" + symbol: Hs +- number: 109 + name: "Мейтнерий" + symbol: Mt +- number: 110 + name: "Дармштадтий" + symbol: Ds +- number: 111 + name: "Рентгений" + symbol: Rg +- number: 112 + name: "Коперниций" + symbol: Cn +- number: 113 + name: "Нихоний" + symbol: Nh +- number: 114 + name: "Флеровий" + symbol: Fl +- number: 115 + name: "Московий" + symbol: Mc +- number: 116 + name: "Ливерморий" + symbol: Lv +- number: 117 + name: "Теннессин" + symbol: Ts +- number: 118 + name: "Оганесон" + symbol: Og diff --git a/test.rb b/test.rb new file mode 100644 index 0000000..0ec00fa --- /dev/null +++ b/test.rb @@ -0,0 +1,9 @@ +require 'yaml' + +Dir[File.expand_path('../data/*/*.yml', 
__FILE__)].each do |file| + puts "Checking #{file}\n" + text = YAML.load_file(file) + if text.uniq.length != text.length + raise "Файл #{file} содержит повторения!" + end +end